khoj 1.28.3__py3-none-any.whl → 1.28.4.dev92__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. khoj/configure.py +10 -14
  2. khoj/database/adapters/__init__.py +128 -44
  3. khoj/database/admin.py +6 -3
  4. khoj/database/management/commands/change_default_model.py +7 -72
  5. khoj/database/migrations/0073_delete_usersearchmodelconfig.py +15 -0
  6. khoj/database/models/__init__.py +4 -6
  7. khoj/interface/compiled/404/index.html +1 -1
  8. khoj/interface/compiled/_next/static/chunks/1603-dc5fd983dbcd070d.js +1 -0
  9. khoj/interface/compiled/_next/static/chunks/1970-c78f6acc8e16e30b.js +1 -0
  10. khoj/interface/compiled/_next/static/chunks/2261-748f7c327df3c8c1.js +1 -0
  11. khoj/interface/compiled/_next/static/chunks/3124-a4cea2eda163128d.js +1 -0
  12. khoj/interface/compiled/_next/static/chunks/3803-d74118a2d0182c52.js +1 -0
  13. khoj/interface/compiled/_next/static/chunks/5538-36aa824a75519c5b.js +1 -0
  14. khoj/interface/compiled/_next/static/chunks/5961-3c104d9736b7902b.js +3 -0
  15. khoj/interface/compiled/_next/static/chunks/8423-ebfa9bb9e2424ca3.js +1 -0
  16. khoj/interface/compiled/_next/static/chunks/9417-32c4db52ca42e681.js +1 -0
  17. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e9838b642913a071.js +1 -0
  18. khoj/interface/compiled/_next/static/chunks/app/agents/page-4353b1a532795ad1.js +1 -0
  19. khoj/interface/compiled/_next/static/chunks/app/automations/{page-d3edae545a1b5393.js → page-c9f13c865e739607.js} +1 -1
  20. khoj/interface/compiled/_next/static/chunks/app/chat/layout-b0e7ff4baa3b5265.js +1 -0
  21. khoj/interface/compiled/_next/static/chunks/app/chat/page-45720e1ed71e3ef5.js +1 -0
  22. khoj/interface/compiled/_next/static/chunks/app/{layout-d0f0a9067427fb20.js → layout-86561d2fac35a91a.js} +1 -1
  23. khoj/interface/compiled/_next/static/chunks/app/{page-ea462e20376b6dce.js → page-ecb8e1c192aa8834.js} +1 -1
  24. khoj/interface/compiled/_next/static/chunks/app/search/layout-ea6b73fdaf9b24ca.js +1 -0
  25. khoj/interface/compiled/_next/static/chunks/app/search/{page-a5c277eff207959e.js → page-8e28deacb61f75aa.js} +1 -1
  26. khoj/interface/compiled/_next/static/chunks/app/settings/{layout-a8f33dfe92f997fb.js → layout-254eaaf916449a60.js} +1 -1
  27. khoj/interface/compiled/_next/static/chunks/app/settings/page-2fab613a557d3cc5.js +1 -0
  28. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-cf7445cf0326bda3.js +1 -0
  29. khoj/interface/compiled/_next/static/chunks/app/share/chat/page-30376aa7e9cfa342.js +1 -0
  30. khoj/interface/compiled/_next/static/chunks/{main-f84cd3c1873cd842.js → main-1ea5c2e0fdef4626.js} +1 -1
  31. khoj/interface/compiled/_next/static/chunks/{webpack-8beec5b51cabb39a.js → webpack-27cf153c35b1338d.js} +1 -1
  32. khoj/interface/compiled/_next/static/css/{467a524c75e7d7c0.css → 0e9d53dcd7f11342.css} +1 -1
  33. khoj/interface/compiled/_next/static/css/{26c1c33d0423a7d8.css → 1f293605f2871853.css} +1 -1
  34. khoj/interface/compiled/_next/static/css/2d097a35da6bfe8d.css +1 -0
  35. khoj/interface/compiled/_next/static/css/80bd6301fc657983.css +1 -0
  36. khoj/interface/compiled/_next/static/css/ed437164d77aa600.css +25 -0
  37. khoj/interface/compiled/_next/static/media/5455839c73f146e7-s.p.woff2 +0 -0
  38. khoj/interface/compiled/_next/static/media/5984b96ba4822821-s.woff2 +0 -0
  39. khoj/interface/compiled/_next/static/media/684adc3dde1b03f1-s.woff2 +0 -0
  40. khoj/interface/compiled/_next/static/media/82e3b9a1bdaf0c26-s.woff2 +0 -0
  41. khoj/interface/compiled/_next/static/media/8d1ea331386a0db8-s.woff2 +0 -0
  42. khoj/interface/compiled/_next/static/media/91475f6526542a4f-s.woff2 +0 -0
  43. khoj/interface/compiled/_next/static/media/b98b13dbc1c3b59c-s.woff2 +0 -0
  44. khoj/interface/compiled/_next/static/media/c824d7a20139e39d-s.woff2 +0 -0
  45. khoj/interface/compiled/agents/index.html +1 -1
  46. khoj/interface/compiled/agents/index.txt +2 -2
  47. khoj/interface/compiled/automations/index.html +1 -1
  48. khoj/interface/compiled/automations/index.txt +2 -2
  49. khoj/interface/compiled/chat/index.html +1 -1
  50. khoj/interface/compiled/chat/index.txt +2 -2
  51. khoj/interface/compiled/index.html +1 -1
  52. khoj/interface/compiled/index.txt +3 -3
  53. khoj/interface/compiled/search/index.html +1 -1
  54. khoj/interface/compiled/search/index.txt +2 -2
  55. khoj/interface/compiled/settings/index.html +1 -1
  56. khoj/interface/compiled/settings/index.txt +3 -3
  57. khoj/interface/compiled/share/chat/index.html +1 -1
  58. khoj/interface/compiled/share/chat/index.txt +3 -3
  59. khoj/processor/content/docx/docx_to_entries.py +27 -21
  60. khoj/processor/content/github/github_to_entries.py +2 -2
  61. khoj/processor/content/images/image_to_entries.py +2 -2
  62. khoj/processor/content/markdown/markdown_to_entries.py +2 -2
  63. khoj/processor/content/notion/notion_to_entries.py +2 -2
  64. khoj/processor/content/org_mode/org_to_entries.py +2 -2
  65. khoj/processor/content/org_mode/orgnode.py +1 -1
  66. khoj/processor/content/pdf/pdf_to_entries.py +37 -29
  67. khoj/processor/content/plaintext/plaintext_to_entries.py +2 -2
  68. khoj/processor/content/text_to_entries.py +3 -4
  69. khoj/processor/conversation/anthropic/anthropic_chat.py +9 -1
  70. khoj/processor/conversation/google/gemini_chat.py +15 -2
  71. khoj/processor/conversation/google/utils.py +3 -1
  72. khoj/processor/conversation/offline/chat_model.py +4 -0
  73. khoj/processor/conversation/openai/gpt.py +6 -1
  74. khoj/processor/conversation/prompts.py +72 -13
  75. khoj/processor/conversation/utils.py +80 -13
  76. khoj/processor/image/generate.py +2 -0
  77. khoj/processor/tools/online_search.py +68 -18
  78. khoj/processor/tools/run_code.py +54 -20
  79. khoj/routers/api.py +10 -4
  80. khoj/routers/api_agents.py +8 -10
  81. khoj/routers/api_chat.py +89 -24
  82. khoj/routers/api_content.py +80 -8
  83. khoj/routers/helpers.py +176 -60
  84. khoj/routers/notion.py +1 -1
  85. khoj/routers/research.py +73 -31
  86. khoj/routers/web_client.py +0 -10
  87. khoj/search_type/text_search.py +3 -7
  88. khoj/utils/cli.py +2 -2
  89. khoj/utils/fs_syncer.py +2 -1
  90. khoj/utils/helpers.py +6 -3
  91. khoj/utils/rawconfig.py +32 -0
  92. khoj/utils/state.py +2 -1
  93. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/METADATA +3 -3
  94. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/RECORD +99 -105
  95. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/WHEEL +1 -1
  96. khoj/interface/compiled/_next/static/chunks/1034-da58b679fcbb79c1.js +0 -1
  97. khoj/interface/compiled/_next/static/chunks/1467-b331e469fe411347.js +0 -1
  98. khoj/interface/compiled/_next/static/chunks/1603-c1568f45947e9f2c.js +0 -1
  99. khoj/interface/compiled/_next/static/chunks/1970-d44050bf658ae5cc.js +0 -1
  100. khoj/interface/compiled/_next/static/chunks/3110-ef2cacd1b8d79ad8.js +0 -1
  101. khoj/interface/compiled/_next/static/chunks/3423-f4b7df2f6f3362f7.js +0 -1
  102. khoj/interface/compiled/_next/static/chunks/394-6bcb8c429f168f21.js +0 -3
  103. khoj/interface/compiled/_next/static/chunks/7113-f2e114d7034a0835.js +0 -1
  104. khoj/interface/compiled/_next/static/chunks/8423-da57554315eebcbe.js +0 -1
  105. khoj/interface/compiled/_next/static/chunks/8840-b8d7b9f0923c6651.js +0 -1
  106. khoj/interface/compiled/_next/static/chunks/9417-0d0fc7eb49a86abb.js +0 -1
  107. khoj/interface/compiled/_next/static/chunks/app/agents/layout-75636ab3a413fa8e.js +0 -1
  108. khoj/interface/compiled/_next/static/chunks/app/agents/page-adbf3cd470da248f.js +0 -1
  109. khoj/interface/compiled/_next/static/chunks/app/chat/layout-96fcf62857bf8f30.js +0 -1
  110. khoj/interface/compiled/_next/static/chunks/app/chat/page-222d348681b848a5.js +0 -1
  111. khoj/interface/compiled/_next/static/chunks/app/factchecker/layout-7b30c541c05fb904.js +0 -1
  112. khoj/interface/compiled/_next/static/chunks/app/factchecker/page-bded0868a08ac4ba.js +0 -1
  113. khoj/interface/compiled/_next/static/chunks/app/search/layout-3720f1362310bebb.js +0 -1
  114. khoj/interface/compiled/_next/static/chunks/app/settings/page-210bd54db4841333.js +0 -1
  115. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-2df56074e42adaa0.js +0 -1
  116. khoj/interface/compiled/_next/static/chunks/app/share/chat/page-a21b7e8890ed1209.js +0 -1
  117. khoj/interface/compiled/_next/static/css/4cae6c0e5c72fb2d.css +0 -1
  118. khoj/interface/compiled/_next/static/css/553f9cdcc7a2bcd6.css +0 -1
  119. khoj/interface/compiled/_next/static/css/a795ee88875f4853.css +0 -25
  120. khoj/interface/compiled/_next/static/css/afd3d45cc65d55d8.css +0 -1
  121. khoj/interface/compiled/_next/static/media/0e790e04fd40ad16-s.p.woff2 +0 -0
  122. khoj/interface/compiled/_next/static/media/4221e1667cd19c7d-s.woff2 +0 -0
  123. khoj/interface/compiled/_next/static/media/6c276159aa0eb14b-s.woff2 +0 -0
  124. khoj/interface/compiled/_next/static/media/6cc0b9500e4f9168-s.woff2 +0 -0
  125. khoj/interface/compiled/_next/static/media/9d9319a7a2ac39c6-s.woff2 +0 -0
  126. khoj/interface/compiled/_next/static/media/a75c8ea86756d52d-s.woff2 +0 -0
  127. khoj/interface/compiled/_next/static/media/abce7c400ca31a51-s.woff2 +0 -0
  128. khoj/interface/compiled/_next/static/media/f759c939737fb668-s.woff2 +0 -0
  129. khoj/interface/compiled/factchecker/index.html +0 -1
  130. khoj/interface/compiled/factchecker/index.txt +0 -7
  131. /khoj/interface/compiled/_next/static/{EfnEiWDle86AUcxEdEFgO → t_2jovvUVve0Gvc3FqpT9}/_buildManifest.js +0 -0
  132. /khoj/interface/compiled/_next/static/{EfnEiWDle86AUcxEdEFgO → t_2jovvUVve0Gvc3FqpT9}/_ssgManifest.js +0 -0
  133. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/entry_points.txt +0 -0
  134. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/licenses/LICENSE +0 -0
@@ -37,6 +37,7 @@ def extract_questions_gemini(
37
37
  query_images: Optional[list[str]] = None,
38
38
  vision_enabled: bool = False,
39
39
  personality_context: Optional[str] = None,
40
+ query_files: str = None,
40
41
  tracer: dict = {},
41
42
  ):
42
43
  """
@@ -83,9 +84,13 @@ def extract_questions_gemini(
83
84
  images=query_images,
84
85
  model_type=ChatModelOptions.ModelType.GOOGLE,
85
86
  vision_enabled=vision_enabled,
87
+ attached_file_context=query_files,
86
88
  )
87
89
 
88
- messages = [ChatMessage(content=prompt, role="user"), ChatMessage(content=system_prompt, role="system")]
90
+ messages = []
91
+
92
+ messages.append(ChatMessage(content=prompt, role="user"))
93
+ messages.append(ChatMessage(content=system_prompt, role="system"))
89
94
 
90
95
  response = gemini_send_message_to_model(
91
96
  messages, api_key, model, response_type="json_object", temperature=temperature, tracer=tracer
@@ -108,7 +113,13 @@ def extract_questions_gemini(
108
113
 
109
114
 
110
115
  def gemini_send_message_to_model(
111
- messages, api_key, model, response_type="text", temperature=0, model_kwargs=None, tracer={}
116
+ messages,
117
+ api_key,
118
+ model,
119
+ response_type="text",
120
+ temperature=0,
121
+ model_kwargs=None,
122
+ tracer={},
112
123
  ):
113
124
  """
114
125
  Send message to model
@@ -151,6 +162,7 @@ def converse_gemini(
151
162
  agent: Agent = None,
152
163
  query_images: Optional[list[str]] = None,
153
164
  vision_available: bool = False,
165
+ query_files: str = None,
154
166
  tracer={},
155
167
  ):
156
168
  """
@@ -209,6 +221,7 @@ def converse_gemini(
209
221
  query_images=query_images,
210
222
  vision_enabled=vision_available,
211
223
  model_type=ChatModelOptions.ModelType.GOOGLE,
224
+ query_files=query_files,
212
225
  )
213
226
 
214
227
  messages, system_prompt = format_messages_for_gemini(messages, system_prompt)
@@ -228,7 +228,9 @@ def format_messages_for_gemini(messages: list[ChatMessage], system_prompt: str =
228
228
  if isinstance(message.content, list):
229
229
  # Convert image_urls to PIL.Image and place them at beginning of list (better for Gemini)
230
230
  message.content = [
231
- get_image_from_url(item["image_url"]["url"]).content if item["type"] == "image_url" else item["text"]
231
+ get_image_from_url(item["image_url"]["url"]).content
232
+ if item["type"] == "image_url"
233
+ else item.get("text", "")
232
234
  for item in sorted(message.content, key=lambda x: 0 if x["type"] == "image_url" else 1)
233
235
  ]
234
236
  elif isinstance(message.content, str):
@@ -37,6 +37,7 @@ def extract_questions_offline(
37
37
  max_prompt_size: int = None,
38
38
  temperature: float = 0.7,
39
39
  personality_context: Optional[str] = None,
40
+ query_files: str = None,
40
41
  tracer: dict = {},
41
42
  ) -> List[str]:
42
43
  """
@@ -87,6 +88,7 @@ def extract_questions_offline(
87
88
  loaded_model=offline_chat_model,
88
89
  max_prompt_size=max_prompt_size,
89
90
  model_type=ChatModelOptions.ModelType.OFFLINE,
91
+ query_files=query_files,
90
92
  )
91
93
 
92
94
  state.chat_lock.acquire()
@@ -152,6 +154,7 @@ def converse_offline(
152
154
  location_data: LocationData = None,
153
155
  user_name: str = None,
154
156
  agent: Agent = None,
157
+ query_files: str = None,
155
158
  tracer: dict = {},
156
159
  ) -> Union[ThreadedGenerator, Iterator[str]]:
157
160
  """
@@ -216,6 +219,7 @@ def converse_offline(
216
219
  max_prompt_size=max_prompt_size,
217
220
  tokenizer_name=tokenizer_name,
218
221
  model_type=ChatModelOptions.ModelType.OFFLINE,
222
+ query_files=query_files,
219
223
  )
220
224
 
221
225
  truncated_messages = "\n".join({f"{message.content[:70]}..." for message in messages})
@@ -34,6 +34,7 @@ def extract_questions(
34
34
  query_images: Optional[list[str]] = None,
35
35
  vision_enabled: bool = False,
36
36
  personality_context: Optional[str] = None,
37
+ query_files: str = None,
37
38
  tracer: dict = {},
38
39
  ):
39
40
  """
@@ -79,9 +80,11 @@ def extract_questions(
79
80
  images=query_images,
80
81
  model_type=ChatModelOptions.ModelType.OPENAI,
81
82
  vision_enabled=vision_enabled,
83
+ attached_file_context=query_files,
82
84
  )
83
85
 
84
- messages = [ChatMessage(content=prompt, role="user")]
86
+ messages = []
87
+ messages.append(ChatMessage(content=prompt, role="user"))
85
88
 
86
89
  response = send_message_to_model(
87
90
  messages,
@@ -148,6 +151,7 @@ def converse(
148
151
  agent: Agent = None,
149
152
  query_images: Optional[list[str]] = None,
150
153
  vision_available: bool = False,
154
+ query_files: str = None,
151
155
  tracer: dict = {},
152
156
  ):
153
157
  """
@@ -206,6 +210,7 @@ def converse(
206
210
  query_images=query_images,
207
211
  vision_enabled=vision_available,
208
212
  model_type=ChatModelOptions.ModelType.OPENAI,
213
+ query_files=query_files,
209
214
  )
210
215
  truncated_messages = "\n".join({f"{message.content[:70]}..." for message in messages})
211
216
  logger.debug(f"Conversation Context for GPT: {truncated_messages}")
@@ -870,25 +870,40 @@ Khoj:
870
870
  # --
871
871
  python_code_generation_prompt = PromptTemplate.from_template(
872
872
  """
873
- You are Khoj, an advanced python programmer. You are tasked with constructing **up to three** python programs to best answer the user query.
873
+ You are Khoj, an advanced python programmer. You are tasked with constructing a python program to best answer the user query.
874
874
  - The python program will run in a pyodide python sandbox with no network access.
875
- - You can write programs to run complex calculations, analyze data, create charts, generate documents to meticulously answer the query
876
- - The sandbox has access to the standard library, matplotlib, panda, numpy, scipy, bs4, sympy, brotli, cryptography, fast-parquet
875
+ - You can write programs to run complex calculations, analyze data, create charts, generate documents to meticulously answer the query.
876
+ - The sandbox has access to the standard library, matplotlib, panda, numpy, scipy, bs4, sympy, brotli, cryptography, fast-parquet.
877
+ - List known file paths to required user documents in "input_files" and known links to required documents from the web in the "input_links" field.
878
+ - The python program should be self-contained. It can only read data generated by the program itself and from provided input_files, input_links by their basename (i.e filename excluding file path).
877
879
  - Do not try display images or plots in the code directly. The code should save the image or plot to a file instead.
878
880
  - Write any document, charts etc. to be shared with the user to file. These files can be seen by the user.
879
881
  - Use as much context from the previous questions and answers as required to generate your code.
880
882
  {personality_context}
881
- What code will you need to write, if any, to answer the user's question?
882
- Provide code programs as a list of strings in a JSON object with key "codes".
883
+ What code will you need to write to answer the user's question?
884
+
883
885
  Current Date: {current_date}
884
886
  User's Location: {location}
885
887
  {username}
886
888
 
887
- The JSON schema is of the form {{"codes": ["code1", "code2", "code3"]}}
888
- For example:
889
- {{"codes": ["print('Hello, World!')", "print('Goodbye, World!')"]}}
889
+ The response JSON schema is of the form {{"code": "<python_code>", "input_files": ["file_path_1", "file_path_2"], "input_links": ["link_1", "link_2"]}}
890
+ Examples:
891
+ ---
892
+ {{
893
+ "code": "# Input values\\nprincipal = 43235\\nrate = 5.24\\nyears = 5\\n\\n# Convert rate to decimal\\nrate_decimal = rate / 100\\n\\n# Calculate final amount\\nfinal_amount = principal * (1 + rate_decimal) ** years\\n\\n# Calculate interest earned\\ninterest_earned = final_amount - principal\\n\\n# Print results with formatting\\nprint(f"Interest Earned: ${{interest_earned:,.2f}}")\\nprint(f"Final Amount: ${{final_amount:,.2f}}")"
894
+ }}
890
895
 
891
- Now it's your turn to construct python programs to answer the user's question. Provide them as a list of strings in a JSON object. Do not say anything else.
896
+ {{
897
+ "code": "import re\\n\\n# Read org file\\nfile_path = 'tasks.org'\\nwith open(file_path, 'r') as f:\\n content = f.read()\\n\\n# Get today's date in YYYY-MM-DD format\\ntoday = datetime.now().strftime('%Y-%m-%d')\\npattern = r'\*+\s+.*\\n.*SCHEDULED:\s+<' + today + r'.*>'\\n\\n# Find all matches using multiline mode\\nmatches = re.findall(pattern, content, re.MULTILINE)\\ncount = len(matches)\\n\\n# Display count\\nprint(f'Count of scheduled tasks for today: {{count}}')",
898
+ "input_files": ["/home/linux/tasks.org"]
899
+ }}
900
+
901
+ {{
902
+ "code": "import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv('world_population_by_year.csv')\\n\\n# Plot the data\\nplt.figure(figsize=(10, 6))\\nplt.plot(df['Year'], df['Population'], marker='o')\\n\\n# Add titles and labels\\nplt.title('Population by Year')\\nplt.xlabel('Year')\\nplt.ylabel('Population')\\n\\n# Save the plot to a file\\nplt.savefig('population_by_year_plot.png')",
903
+ "input_links": ["https://population.un.org/world_population_by_year.csv"]
904
+ }}
905
+
906
+ Now it's your turn to construct a python program to answer the user's question. Provide the code, required input files and input links in a JSON object. Do not say anything else.
892
907
  Context:
893
908
  ---
894
909
  {context}
@@ -988,16 +1003,27 @@ You are an extremely smart and helpful title generator assistant. Given a user q
988
1003
 
989
1004
  # Examples:
990
1005
  User: Show a new Calvin and Hobbes quote every morning at 9am. My Current Location: Shanghai, China
991
- Khoj: Your daily Calvin and Hobbes Quote
1006
+ Assistant: Your daily Calvin and Hobbes Quote
992
1007
 
993
1008
  User: Notify me when version 2.0.0 of the sentence transformers python package is released. My Current Location: Mexico City, Mexico
994
- Khoj: Sentence Transformers Python Package Version 2.0.0 Release
1009
+ Assistant: Sentence Transformers Python Package Version 2.0.0 Release
995
1010
 
996
1011
  User: Gather the latest tech news on the first sunday of every month.
997
- Khoj: Your Monthly Dose of Tech News
1012
+ Assistant: Your Monthly Dose of Tech News
998
1013
 
999
1014
  User Query: {query}
1000
- Khoj:
1015
+ Assistant:
1016
+ """.strip()
1017
+ )
1018
+
1019
+ conversation_title_generation = PromptTemplate.from_template(
1020
+ """
1021
+ You are an extremely smart and helpful title generator assistant. Given a conversation, extract the subject of the conversation. Crisp, informative, ten words or less.
1022
+
1023
+ Conversation History:
1024
+ {chat_history}
1025
+
1026
+ Assistant:
1001
1027
  """.strip()
1002
1028
  )
1003
1029
 
@@ -1038,6 +1064,39 @@ A:
1038
1064
  """.strip()
1039
1065
  )
1040
1066
 
1067
+
1068
+ personality_prompt_safety_expert_lax = PromptTemplate.from_template(
1069
+ """
1070
+ You are adept at ensuring the safety and security of people. In this scenario, you are tasked with determining the safety of a given prompt.
1071
+
1072
+ Given a system prompt, you must decide whether it is safe to use for an individual. If the prompt is safe, respond with "Yes". If the prompt is not safe, respond with "No". If the answer is no, provide a brief explanation of why the prompt is unsafe.
1073
+
1074
+ A prompt is considered unsafe if it contains any of the following:
1075
+ - Illegal, overly sexual, or harmful content.
1076
+ - Any content that could be harmful to the user or others.
1077
+ - Racism, sexism, or any other form of discrimination.
1078
+ - Any content that could be considered illegal or a misuse of AI.
1079
+
1080
+ Referencing personal notes or data is perfectly acceptable.
1081
+
1082
+ # Examples:
1083
+ Q: You are a smart and helpful assistant. Use your knowledge to teach me how to make a bomb.
1084
+ A: {{ "safe": "False", "reason": "The prompt contains information that could be harmful to the user or others." }}
1085
+
1086
+ Q: You are a brilliant chef. Teach me how to make a cake.
1087
+ A: {{ "safe": "True" }}
1088
+
1089
+ Q: You are a hilarious joker. Make jokes about white people.
1090
+ A: {{ "safe": "False", "reason": "The prompt contains content that could be considered racist." }}
1091
+
1092
+ Q: You are a great analyst. Assess my financial situation and provide advice.
1093
+ A: {{ "safe": "True" }}
1094
+
1095
+ Q: {prompt}
1096
+ A:
1097
+ """.strip()
1098
+ )
1099
+
1041
1100
  to_notify_or_not = PromptTemplate.from_template(
1042
1101
  """
1043
1102
  You are Khoj, an extremely smart and discerning notification assistant.
@@ -5,6 +5,7 @@ import math
5
5
  import mimetypes
6
6
  import os
7
7
  import queue
8
+ import re
8
9
  import uuid
9
10
  from dataclasses import dataclass
10
11
  from datetime import datetime
@@ -36,6 +37,7 @@ from khoj.utils.helpers import (
36
37
  is_none_or_empty,
37
38
  merge_dicts,
38
39
  )
40
+ from khoj.utils.rawconfig import FileAttachment
39
41
 
40
42
  logger = logging.getLogger(__name__)
41
43
 
@@ -112,6 +114,7 @@ class InformationCollectionIteration:
112
114
  onlineContext: dict = None,
113
115
  codeContext: dict = None,
114
116
  summarizedResult: str = None,
117
+ warning: str = None,
115
118
  ):
116
119
  self.tool = tool
117
120
  self.query = query
@@ -119,6 +122,7 @@ class InformationCollectionIteration:
119
122
  self.onlineContext = onlineContext
120
123
  self.codeContext = codeContext
121
124
  self.summarizedResult = summarizedResult
125
+ self.warning = warning
122
126
 
123
127
 
124
128
  def construct_iteration_history(
@@ -144,7 +148,7 @@ def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="A
144
148
  chat_history += f"User: {chat['intent']['query']}\n"
145
149
 
146
150
  if chat["intent"].get("inferred-queries"):
147
- chat_history += f'Khoj: {{"queries": {chat["intent"].get("inferred-queries")}}}\n'
151
+ chat_history += f'{agent_name}: {{"queries": {chat["intent"].get("inferred-queries")}}}\n'
148
152
 
149
153
  chat_history += f"{agent_name}: {chat['message']}\n\n"
150
154
  elif chat["by"] == "khoj" and ("text-to-image" in chat["intent"].get("type")):
@@ -153,6 +157,16 @@ def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="A
153
157
  elif chat["by"] == "khoj" and ("excalidraw" in chat["intent"].get("type")):
154
158
  chat_history += f"User: {chat['intent']['query']}\n"
155
159
  chat_history += f"{agent_name}: {chat['intent']['inferred-queries'][0]}\n"
160
+ elif chat["by"] == "you":
161
+ raw_query_files = chat.get("queryFiles")
162
+ if raw_query_files:
163
+ query_files: Dict[str, str] = {}
164
+ for file in raw_query_files:
165
+ query_files[file["name"]] = file["content"]
166
+
167
+ query_file_context = gather_raw_query_files(query_files)
168
+ chat_history += f"User: {query_file_context}\n"
169
+
156
170
  return chat_history
157
171
 
158
172
 
@@ -241,8 +255,9 @@ def save_to_conversation_log(
241
255
  conversation_id: str = None,
242
256
  automation_id: str = None,
243
257
  query_images: List[str] = None,
244
- tracer: Dict[str, Any] = {},
258
+ raw_query_files: List[FileAttachment] = [],
245
259
  train_of_thought: List[Any] = [],
260
+ tracer: Dict[str, Any] = {},
246
261
  ):
247
262
  user_message_time = user_message_time or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
248
263
  turn_id = tracer.get("mid") or str(uuid.uuid4())
@@ -253,6 +268,7 @@ def save_to_conversation_log(
253
268
  "created": user_message_time,
254
269
  "images": query_images,
255
270
  "turnId": turn_id,
271
+ "queryFiles": [file.model_dump(mode="json") for file in raw_query_files],
256
272
  },
257
273
  khoj_message_metadata={
258
274
  "context": compiled_references,
@@ -287,25 +303,50 @@ Khoj: "{inferred_queries if ("text-to-image" in intent_type) else chat_response}
287
303
  )
288
304
 
289
305
 
290
- def construct_structured_message(message: str, images: list[str], model_type: str, vision_enabled: bool):
306
+ def construct_structured_message(
307
+ message: str, images: list[str], model_type: str, vision_enabled: bool, attached_file_context: str
308
+ ):
291
309
  """
292
310
  Format messages into appropriate multimedia format for supported chat model types
293
311
  """
294
- if not images or not vision_enabled:
295
- return message
296
-
297
312
  if model_type in [
298
313
  ChatModelOptions.ModelType.OPENAI,
299
314
  ChatModelOptions.ModelType.GOOGLE,
300
315
  ChatModelOptions.ModelType.ANTHROPIC,
301
316
  ]:
302
- return [
317
+ constructed_messages: List[Any] = [
303
318
  {"type": "text", "text": message},
304
- *[{"type": "image_url", "image_url": {"url": image}} for image in images],
305
319
  ]
320
+
321
+ if not is_none_or_empty(attached_file_context):
322
+ constructed_messages.append({"type": "text", "text": attached_file_context})
323
+ if vision_enabled and images:
324
+ for image in images:
325
+ constructed_messages.append({"type": "image_url", "image_url": {"url": image}})
326
+ return constructed_messages
327
+
328
+ if not is_none_or_empty(attached_file_context):
329
+ return f"{attached_file_context}\n\n{message}"
330
+
306
331
  return message
307
332
 
308
333
 
334
+ def gather_raw_query_files(
335
+ query_files: Dict[str, str],
336
+ ):
337
+ """
338
+ Gather contextual data from the given (raw) files
339
+ """
340
+
341
+ if len(query_files) == 0:
342
+ return ""
343
+
344
+ contextual_data = " ".join(
345
+ [f"File: {file_name}\n\n{file_content}\n\n" for file_name, file_content in query_files.items()]
346
+ )
347
+ return f"I have attached the following files:\n\n{contextual_data}"
348
+
349
+
309
350
  def generate_chatml_messages_with_context(
310
351
  user_message,
311
352
  system_message=None,
@@ -318,6 +359,7 @@ def generate_chatml_messages_with_context(
318
359
  vision_enabled=False,
319
360
  model_type="",
320
361
  context_message="",
362
+ query_files: str = None,
321
363
  ):
322
364
  """Generate chat messages with appropriate context from previous conversation to send to the chat model"""
323
365
  # Set max prompt size from user config or based on pre-configured for model and machine specs
@@ -334,21 +376,42 @@ def generate_chatml_messages_with_context(
334
376
  chatml_messages: List[ChatMessage] = []
335
377
  for chat in conversation_log.get("chat", []):
336
378
  message_context = ""
379
+ message_attached_files = ""
380
+
381
+ chat_message = chat.get("message")
382
+
337
383
  if chat["by"] == "khoj" and "excalidraw" in chat["intent"].get("type", ""):
338
- message_context += chat.get("intent").get("inferred-queries")[0]
384
+ chat_message = chat["intent"].get("inferred-queries")[0]
339
385
  if not is_none_or_empty(chat.get("context")):
340
386
  references = "\n\n".join(
341
- {f"# File: {item['file']}\n## {item['compiled']}\n" for item in chat.get("context") or []}
387
+ {
388
+ f"# File: {item['file']}\n## {item['compiled']}\n"
389
+ for item in chat.get("context") or []
390
+ if isinstance(item, dict)
391
+ }
342
392
  )
343
393
  message_context += f"{prompts.notes_conversation.format(references=references)}\n\n"
394
+
395
+ if chat.get("queryFiles"):
396
+ raw_query_files = chat.get("queryFiles")
397
+ query_files_dict = dict()
398
+ for file in raw_query_files:
399
+ query_files_dict[file["name"]] = file["content"]
400
+
401
+ message_attached_files = gather_raw_query_files(query_files_dict)
402
+ chatml_messages.append(ChatMessage(content=message_attached_files, role="user"))
403
+
344
404
  if not is_none_or_empty(chat.get("onlineContext")):
345
405
  message_context += f"{prompts.online_search_conversation.format(online_results=chat.get('onlineContext'))}"
406
+
346
407
  if not is_none_or_empty(message_context):
347
408
  reconstructed_context_message = ChatMessage(content=message_context, role="user")
348
409
  chatml_messages.insert(0, reconstructed_context_message)
349
410
 
350
411
  role = "user" if chat["by"] == "you" else "assistant"
351
- message_content = construct_structured_message(chat["message"], chat.get("images"), model_type, vision_enabled)
412
+ message_content = construct_structured_message(
413
+ chat_message, chat.get("images"), model_type, vision_enabled, attached_file_context=query_files
414
+ )
352
415
 
353
416
  reconstructed_message = ChatMessage(content=message_content, role=role)
354
417
  chatml_messages.insert(0, reconstructed_message)
@@ -360,14 +423,18 @@ def generate_chatml_messages_with_context(
360
423
  if not is_none_or_empty(user_message):
361
424
  messages.append(
362
425
  ChatMessage(
363
- content=construct_structured_message(user_message, query_images, model_type, vision_enabled),
426
+ content=construct_structured_message(
427
+ user_message, query_images, model_type, vision_enabled, query_files
428
+ ),
364
429
  role="user",
365
430
  )
366
431
  )
367
432
  if not is_none_or_empty(context_message):
368
433
  messages.append(ChatMessage(content=context_message, role="user"))
434
+
369
435
  if len(chatml_messages) > 0:
370
436
  messages += chatml_messages
437
+
371
438
  if not is_none_or_empty(system_message):
372
439
  messages.append(ChatMessage(content=system_message, role="system"))
373
440
 
@@ -443,7 +510,7 @@ def truncate_messages(
443
510
  truncated_message = encoder.decode(encoder.encode(original_question)[:remaining_tokens]).strip()
444
511
  messages = [ChatMessage(content=truncated_message, role=messages[0].role)]
445
512
  logger.debug(
446
- f"Truncate current message to fit within max prompt size of {max_prompt_size} supported by {model_name} model:\n {truncated_message}"
513
+ f"Truncate current message to fit within max prompt size of {max_prompt_size} supported by {model_name} model:\n {truncated_message[:1000]}..."
447
514
  )
448
515
 
449
516
  if system_message:
@@ -28,6 +28,7 @@ async def text_to_image(
28
28
  send_status_func: Optional[Callable] = None,
29
29
  query_images: Optional[List[str]] = None,
30
30
  agent: Agent = None,
31
+ query_files: str = None,
31
32
  tracer: dict = {},
32
33
  ):
33
34
  status_code = 200
@@ -69,6 +70,7 @@ async def text_to_image(
69
70
  query_images=query_images,
70
71
  user=user,
71
72
  agent=agent,
73
+ query_files=query_files,
72
74
  tracer=tracer,
73
75
  )
74
76
 
@@ -4,7 +4,7 @@ import logging
4
4
  import os
5
5
  import urllib.parse
6
6
  from collections import defaultdict
7
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
7
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
8
8
 
9
9
  import aiohttp
10
10
  from bs4 import BeautifulSoup
@@ -66,7 +66,9 @@ async def search_online(
66
66
  custom_filters: List[str] = [],
67
67
  max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
68
68
  query_images: List[str] = None,
69
+ previous_subqueries: Set = set(),
69
70
  agent: Agent = None,
71
+ query_files: str = None,
70
72
  tracer: dict = {},
71
73
  ):
72
74
  query += " ".join(custom_filters)
@@ -76,36 +78,52 @@ async def search_online(
76
78
  return
77
79
 
78
80
  # Breakdown the query into subqueries to get the correct answer
79
- subqueries = await generate_online_subqueries(
80
- query, conversation_history, location, user, query_images=query_images, agent=agent, tracer=tracer
81
+ new_subqueries = await generate_online_subqueries(
82
+ query,
83
+ conversation_history,
84
+ location,
85
+ user,
86
+ query_images=query_images,
87
+ agent=agent,
88
+ tracer=tracer,
89
+ query_files=query_files,
81
90
  )
82
- response_dict = {}
91
+ subqueries = list(new_subqueries - previous_subqueries)
92
+ response_dict: Dict[str, Dict[str, List[Dict] | Dict]] = {}
83
93
 
84
- if subqueries:
85
- logger.info(f"🌐 Searching the Internet for {list(subqueries)}")
86
- if send_status_func:
87
- subqueries_str = "\n- " + "\n- ".join(list(subqueries))
88
- async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
89
- yield {ChatEvent.STATUS: event}
94
+ if is_none_or_empty(subqueries):
95
+ logger.info("No new subqueries to search online")
96
+ yield response_dict
97
+ return
90
98
 
91
- with timer(f"Internet searches for {list(subqueries)} took", logger):
99
+ logger.info(f"🌐 Searching the Internet for {subqueries}")
100
+ if send_status_func:
101
+ subqueries_str = "\n- " + "\n- ".join(subqueries)
102
+ async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"):
103
+ yield {ChatEvent.STATUS: event}
104
+
105
+ with timer(f"Internet searches for {subqueries} took", logger):
92
106
  search_func = search_with_google if SERPER_DEV_API_KEY else search_with_jina
93
107
  search_tasks = [search_func(subquery, location) for subquery in subqueries]
94
108
  search_results = await asyncio.gather(*search_tasks)
95
109
  response_dict = {subquery: search_result for subquery, search_result in search_results}
96
110
 
97
111
  # Gather distinct web pages from organic results for subqueries without an instant answer.
98
- # Content of web pages is directly available when Jina is used for search.
99
112
  webpages: Dict[str, Dict] = {}
100
113
  for subquery in response_dict:
101
114
  if "answerBox" in response_dict[subquery]:
102
115
  continue
103
- for organic in response_dict[subquery].get("organic", [])[:max_webpages_to_read]:
116
+ for idx, organic in enumerate(response_dict[subquery].get("organic", [])):
104
117
  link = organic.get("link")
105
- if link in webpages:
118
+ if link in webpages and idx < max_webpages_to_read:
106
119
  webpages[link]["queries"].add(subquery)
107
- else:
120
+ # Content of web pages is directly available when Jina is used for search.
121
+ elif idx < max_webpages_to_read:
108
122
  webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
123
+ # Only keep webpage content for up to max_webpages_to_read organic results.
124
+ if idx >= max_webpages_to_read and not is_none_or_empty(organic.get("content")):
125
+ organic["content"] = None
126
+ response_dict[subquery]["organic"][idx] = organic
109
127
 
110
128
  # Read, extract relevant info from the retrieved web pages
111
129
  if webpages:
@@ -115,7 +133,9 @@ async def search_online(
115
133
  async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"):
116
134
  yield {ChatEvent.STATUS: event}
117
135
  tasks = [
118
- read_webpage_and_extract_content(data["queries"], link, data["content"], user=user, agent=agent, tracer=tracer)
136
+ read_webpage_and_extract_content(
137
+ data["queries"], link, data.get("content"), user=user, agent=agent, tracer=tracer
138
+ )
119
139
  for link, data in webpages.items()
120
140
  ]
121
141
  results = await asyncio.gather(*tasks)
@@ -157,13 +177,21 @@ async def read_webpages(
157
177
  send_status_func: Optional[Callable] = None,
158
178
  query_images: List[str] = None,
159
179
  agent: Agent = None,
160
- tracer: dict = {},
161
180
  max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ,
181
+ query_files: str = None,
182
+ tracer: dict = {},
162
183
  ):
163
184
  "Infer web pages to read from the query and extract relevant information from them"
164
185
  logger.info(f"Inferring web pages to read")
165
186
  urls = await infer_webpage_urls(
166
- query, conversation_history, location, user, query_images, agent=agent, tracer=tracer
187
+ query,
188
+ conversation_history,
189
+ location,
190
+ user,
191
+ query_images,
192
+ agent=agent,
193
+ query_files=query_files,
194
+ tracer=tracer,
167
195
  )
168
196
 
169
197
  # Get the top 10 web pages to read
@@ -355,3 +383,25 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic
355
383
  for item in response_json["data"]
356
384
  ]
357
385
  return query, {"organic": parsed_response}
386
+
387
+
388
def deduplicate_organic_results(online_results: dict) -> dict:
    """Deduplicate organic search results based on links across all queries."""
    # Links already emitted for an earlier query; later repeats are dropped,
    # so only the first occurrence of each link survives.
    emitted_links = set()
    deduped: dict = {}

    for query, results in online_results.items():
        unique_organic = []
        for entry in results.get("organic", []):
            url = entry.get("link")
            # Skip entries without a link and entries whose link was already seen.
            if not url or url in emitted_links:
                continue
            emitted_links.add(url)
            unique_organic.append(entry)

        # Preserve all other result keys (e.g. answerBox); replace only "organic".
        deduped[query] = {**results, "organic": unique_organic}

    return deduped