khoj 1.16.1.dev15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. khoj/__init__.py +0 -0
  2. khoj/app/README.md +94 -0
  3. khoj/app/__init__.py +0 -0
  4. khoj/app/asgi.py +16 -0
  5. khoj/app/settings.py +192 -0
  6. khoj/app/urls.py +25 -0
  7. khoj/configure.py +424 -0
  8. khoj/database/__init__.py +0 -0
  9. khoj/database/adapters/__init__.py +1234 -0
  10. khoj/database/admin.py +290 -0
  11. khoj/database/apps.py +6 -0
  12. khoj/database/management/__init__.py +0 -0
  13. khoj/database/management/commands/__init__.py +0 -0
  14. khoj/database/management/commands/change_generated_images_url.py +61 -0
  15. khoj/database/management/commands/convert_images_png_to_webp.py +99 -0
  16. khoj/database/migrations/0001_khojuser.py +98 -0
  17. khoj/database/migrations/0002_googleuser.py +32 -0
  18. khoj/database/migrations/0003_vector_extension.py +10 -0
  19. khoj/database/migrations/0004_content_types_and_more.py +181 -0
  20. khoj/database/migrations/0005_embeddings_corpus_id.py +19 -0
  21. khoj/database/migrations/0006_embeddingsdates.py +33 -0
  22. khoj/database/migrations/0007_add_conversation.py +27 -0
  23. khoj/database/migrations/0008_alter_conversation_conversation_log.py +17 -0
  24. khoj/database/migrations/0009_khojapiuser.py +24 -0
  25. khoj/database/migrations/0010_chatmodeloptions_and_more.py +83 -0
  26. khoj/database/migrations/0010_rename_embeddings_entry_and_more.py +30 -0
  27. khoj/database/migrations/0011_merge_20231102_0138.py +14 -0
  28. khoj/database/migrations/0012_entry_file_source.py +21 -0
  29. khoj/database/migrations/0013_subscription.py +37 -0
  30. khoj/database/migrations/0014_alter_googleuser_picture.py +17 -0
  31. khoj/database/migrations/0015_alter_subscription_user.py +21 -0
  32. khoj/database/migrations/0016_alter_subscription_renewal_date.py +17 -0
  33. khoj/database/migrations/0017_searchmodel.py +32 -0
  34. khoj/database/migrations/0018_searchmodelconfig_delete_searchmodel.py +30 -0
  35. khoj/database/migrations/0019_alter_googleuser_family_name_and_more.py +27 -0
  36. khoj/database/migrations/0020_reflectivequestion.py +36 -0
  37. khoj/database/migrations/0021_speechtotextmodeloptions_and_more.py +42 -0
  38. khoj/database/migrations/0022_texttoimagemodelconfig.py +25 -0
  39. khoj/database/migrations/0023_usersearchmodelconfig.py +33 -0
  40. khoj/database/migrations/0024_alter_entry_embeddings.py +18 -0
  41. khoj/database/migrations/0025_clientapplication_khojuser_phone_number_and_more.py +46 -0
  42. khoj/database/migrations/0025_searchmodelconfig_embeddings_inference_endpoint_and_more.py +22 -0
  43. khoj/database/migrations/0026_searchmodelconfig_cross_encoder_inference_endpoint_and_more.py +22 -0
  44. khoj/database/migrations/0027_merge_20240118_1324.py +13 -0
  45. khoj/database/migrations/0028_khojuser_verified_phone_number.py +17 -0
  46. khoj/database/migrations/0029_userrequests.py +27 -0
  47. khoj/database/migrations/0030_conversation_slug_and_title.py +38 -0
  48. khoj/database/migrations/0031_agent_conversation_agent.py +53 -0
  49. khoj/database/migrations/0031_alter_googleuser_locale.py +30 -0
  50. khoj/database/migrations/0032_merge_20240322_0427.py +14 -0
  51. khoj/database/migrations/0033_rename_tuning_agent_personality.py +17 -0
  52. khoj/database/migrations/0034_alter_chatmodeloptions_chat_model.py +32 -0
  53. khoj/database/migrations/0035_processlock.py +26 -0
  54. khoj/database/migrations/0036_alter_processlock_name.py +19 -0
  55. khoj/database/migrations/0036_delete_offlinechatprocessorconversationconfig.py +15 -0
  56. khoj/database/migrations/0036_publicconversation.py +42 -0
  57. khoj/database/migrations/0037_chatmodeloptions_openai_config_and_more.py +51 -0
  58. khoj/database/migrations/0037_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +32 -0
  59. khoj/database/migrations/0038_merge_20240425_0857.py +14 -0
  60. khoj/database/migrations/0038_merge_20240426_1640.py +12 -0
  61. khoj/database/migrations/0039_merge_20240501_0301.py +12 -0
  62. khoj/database/migrations/0040_alter_processlock_name.py +26 -0
  63. khoj/database/migrations/0040_merge_20240504_1010.py +14 -0
  64. khoj/database/migrations/0041_merge_20240505_1234.py +14 -0
  65. khoj/database/migrations/0042_serverchatsettings.py +46 -0
  66. khoj/database/migrations/0043_alter_chatmodeloptions_model_type.py +21 -0
  67. khoj/database/migrations/0044_conversation_file_filters.py +17 -0
  68. khoj/database/migrations/0045_fileobject.py +37 -0
  69. khoj/database/migrations/0046_khojuser_email_verification_code_and_more.py +22 -0
  70. khoj/database/migrations/0047_alter_entry_file_type.py +31 -0
  71. khoj/database/migrations/0048_voicemodeloption_uservoicemodelconfig.py +52 -0
  72. khoj/database/migrations/0049_datastore.py +38 -0
  73. khoj/database/migrations/0049_texttoimagemodelconfig_api_key_and_more.py +58 -0
  74. khoj/database/migrations/0050_alter_processlock_name.py +25 -0
  75. khoj/database/migrations/0051_merge_20240702_1220.py +14 -0
  76. khoj/database/migrations/0052_alter_searchmodelconfig_bi_encoder_docs_encode_config_and_more.py +27 -0
  77. khoj/database/migrations/__init__.py +0 -0
  78. khoj/database/models/__init__.py +402 -0
  79. khoj/database/tests.py +3 -0
  80. khoj/interface/email/feedback.html +34 -0
  81. khoj/interface/email/magic_link.html +17 -0
  82. khoj/interface/email/task.html +40 -0
  83. khoj/interface/email/welcome.html +61 -0
  84. khoj/interface/web/404.html +56 -0
  85. khoj/interface/web/agent.html +312 -0
  86. khoj/interface/web/agents.html +276 -0
  87. khoj/interface/web/assets/icons/agents.svg +6 -0
  88. khoj/interface/web/assets/icons/automation.svg +37 -0
  89. khoj/interface/web/assets/icons/cancel.svg +3 -0
  90. khoj/interface/web/assets/icons/chat.svg +24 -0
  91. khoj/interface/web/assets/icons/collapse.svg +17 -0
  92. khoj/interface/web/assets/icons/computer.png +0 -0
  93. khoj/interface/web/assets/icons/confirm-icon.svg +1 -0
  94. khoj/interface/web/assets/icons/copy-button-success.svg +6 -0
  95. khoj/interface/web/assets/icons/copy-button.svg +5 -0
  96. khoj/interface/web/assets/icons/credit-card.png +0 -0
  97. khoj/interface/web/assets/icons/delete.svg +26 -0
  98. khoj/interface/web/assets/icons/docx.svg +7 -0
  99. khoj/interface/web/assets/icons/edit.svg +4 -0
  100. khoj/interface/web/assets/icons/favicon-128x128.ico +0 -0
  101. khoj/interface/web/assets/icons/favicon-128x128.png +0 -0
  102. khoj/interface/web/assets/icons/favicon-256x256.png +0 -0
  103. khoj/interface/web/assets/icons/favicon.icns +0 -0
  104. khoj/interface/web/assets/icons/github.svg +1 -0
  105. khoj/interface/web/assets/icons/key.svg +4 -0
  106. khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
  107. khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
  108. khoj/interface/web/assets/icons/khoj-logo-sideways.svg +5385 -0
  109. khoj/interface/web/assets/icons/logotype.svg +1 -0
  110. khoj/interface/web/assets/icons/markdown.svg +1 -0
  111. khoj/interface/web/assets/icons/new.svg +23 -0
  112. khoj/interface/web/assets/icons/notion.svg +4 -0
  113. khoj/interface/web/assets/icons/openai-logomark.svg +1 -0
  114. khoj/interface/web/assets/icons/org.svg +1 -0
  115. khoj/interface/web/assets/icons/pdf.svg +23 -0
  116. khoj/interface/web/assets/icons/pencil-edit.svg +5 -0
  117. khoj/interface/web/assets/icons/plaintext.svg +1 -0
  118. khoj/interface/web/assets/icons/question-mark-icon.svg +1 -0
  119. khoj/interface/web/assets/icons/search.svg +25 -0
  120. khoj/interface/web/assets/icons/send.svg +1 -0
  121. khoj/interface/web/assets/icons/share.svg +8 -0
  122. khoj/interface/web/assets/icons/speaker.svg +4 -0
  123. khoj/interface/web/assets/icons/stop-solid.svg +37 -0
  124. khoj/interface/web/assets/icons/sync.svg +4 -0
  125. khoj/interface/web/assets/icons/thumbs-down-svgrepo-com.svg +6 -0
  126. khoj/interface/web/assets/icons/thumbs-up-svgrepo-com.svg +6 -0
  127. khoj/interface/web/assets/icons/user-silhouette.svg +4 -0
  128. khoj/interface/web/assets/icons/voice.svg +8 -0
  129. khoj/interface/web/assets/icons/web.svg +2 -0
  130. khoj/interface/web/assets/icons/whatsapp.svg +17 -0
  131. khoj/interface/web/assets/khoj.css +237 -0
  132. khoj/interface/web/assets/markdown-it.min.js +8476 -0
  133. khoj/interface/web/assets/natural-cron.min.js +1 -0
  134. khoj/interface/web/assets/org.min.js +1823 -0
  135. khoj/interface/web/assets/pico.min.css +5 -0
  136. khoj/interface/web/assets/purify.min.js +3 -0
  137. khoj/interface/web/assets/samples/desktop-browse-draw-sample.png +0 -0
  138. khoj/interface/web/assets/samples/desktop-plain-chat-sample.png +0 -0
  139. khoj/interface/web/assets/samples/desktop-remember-plan-sample.png +0 -0
  140. khoj/interface/web/assets/samples/phone-browse-draw-sample.png +0 -0
  141. khoj/interface/web/assets/samples/phone-plain-chat-sample.png +0 -0
  142. khoj/interface/web/assets/samples/phone-remember-plan-sample.png +0 -0
  143. khoj/interface/web/assets/utils.js +33 -0
  144. khoj/interface/web/base_config.html +445 -0
  145. khoj/interface/web/chat.html +3546 -0
  146. khoj/interface/web/config.html +1011 -0
  147. khoj/interface/web/config_automation.html +1103 -0
  148. khoj/interface/web/content_source_computer_input.html +139 -0
  149. khoj/interface/web/content_source_github_input.html +216 -0
  150. khoj/interface/web/content_source_notion_input.html +94 -0
  151. khoj/interface/web/khoj.webmanifest +51 -0
  152. khoj/interface/web/login.html +219 -0
  153. khoj/interface/web/public_conversation.html +2006 -0
  154. khoj/interface/web/search.html +470 -0
  155. khoj/interface/web/utils.html +48 -0
  156. khoj/main.py +241 -0
  157. khoj/manage.py +22 -0
  158. khoj/migrations/__init__.py +0 -0
  159. khoj/migrations/migrate_offline_chat_default_model.py +69 -0
  160. khoj/migrations/migrate_offline_chat_default_model_2.py +71 -0
  161. khoj/migrations/migrate_offline_chat_schema.py +83 -0
  162. khoj/migrations/migrate_offline_model.py +29 -0
  163. khoj/migrations/migrate_processor_config_openai.py +67 -0
  164. khoj/migrations/migrate_server_pg.py +138 -0
  165. khoj/migrations/migrate_version.py +17 -0
  166. khoj/processor/__init__.py +0 -0
  167. khoj/processor/content/__init__.py +0 -0
  168. khoj/processor/content/docx/__init__.py +0 -0
  169. khoj/processor/content/docx/docx_to_entries.py +110 -0
  170. khoj/processor/content/github/__init__.py +0 -0
  171. khoj/processor/content/github/github_to_entries.py +224 -0
  172. khoj/processor/content/images/__init__.py +0 -0
  173. khoj/processor/content/images/image_to_entries.py +118 -0
  174. khoj/processor/content/markdown/__init__.py +0 -0
  175. khoj/processor/content/markdown/markdown_to_entries.py +165 -0
  176. khoj/processor/content/notion/notion_to_entries.py +260 -0
  177. khoj/processor/content/org_mode/__init__.py +0 -0
  178. khoj/processor/content/org_mode/org_to_entries.py +231 -0
  179. khoj/processor/content/org_mode/orgnode.py +532 -0
  180. khoj/processor/content/pdf/__init__.py +0 -0
  181. khoj/processor/content/pdf/pdf_to_entries.py +116 -0
  182. khoj/processor/content/plaintext/__init__.py +0 -0
  183. khoj/processor/content/plaintext/plaintext_to_entries.py +122 -0
  184. khoj/processor/content/text_to_entries.py +297 -0
  185. khoj/processor/conversation/__init__.py +0 -0
  186. khoj/processor/conversation/anthropic/__init__.py +0 -0
  187. khoj/processor/conversation/anthropic/anthropic_chat.py +206 -0
  188. khoj/processor/conversation/anthropic/utils.py +114 -0
  189. khoj/processor/conversation/offline/__init__.py +0 -0
  190. khoj/processor/conversation/offline/chat_model.py +231 -0
  191. khoj/processor/conversation/offline/utils.py +78 -0
  192. khoj/processor/conversation/offline/whisper.py +15 -0
  193. khoj/processor/conversation/openai/__init__.py +0 -0
  194. khoj/processor/conversation/openai/gpt.py +187 -0
  195. khoj/processor/conversation/openai/utils.py +129 -0
  196. khoj/processor/conversation/openai/whisper.py +13 -0
  197. khoj/processor/conversation/prompts.py +758 -0
  198. khoj/processor/conversation/utils.py +262 -0
  199. khoj/processor/embeddings.py +117 -0
  200. khoj/processor/speech/__init__.py +0 -0
  201. khoj/processor/speech/text_to_speech.py +51 -0
  202. khoj/processor/tools/__init__.py +0 -0
  203. khoj/processor/tools/online_search.py +225 -0
  204. khoj/routers/__init__.py +0 -0
  205. khoj/routers/api.py +626 -0
  206. khoj/routers/api_agents.py +43 -0
  207. khoj/routers/api_chat.py +1180 -0
  208. khoj/routers/api_config.py +434 -0
  209. khoj/routers/api_phone.py +86 -0
  210. khoj/routers/auth.py +181 -0
  211. khoj/routers/email.py +133 -0
  212. khoj/routers/helpers.py +1188 -0
  213. khoj/routers/indexer.py +349 -0
  214. khoj/routers/notion.py +91 -0
  215. khoj/routers/storage.py +35 -0
  216. khoj/routers/subscription.py +104 -0
  217. khoj/routers/twilio.py +36 -0
  218. khoj/routers/web_client.py +471 -0
  219. khoj/search_filter/__init__.py +0 -0
  220. khoj/search_filter/base_filter.py +15 -0
  221. khoj/search_filter/date_filter.py +217 -0
  222. khoj/search_filter/file_filter.py +30 -0
  223. khoj/search_filter/word_filter.py +29 -0
  224. khoj/search_type/__init__.py +0 -0
  225. khoj/search_type/text_search.py +241 -0
  226. khoj/utils/__init__.py +0 -0
  227. khoj/utils/cli.py +93 -0
  228. khoj/utils/config.py +81 -0
  229. khoj/utils/constants.py +24 -0
  230. khoj/utils/fs_syncer.py +249 -0
  231. khoj/utils/helpers.py +418 -0
  232. khoj/utils/initialization.py +146 -0
  233. khoj/utils/jsonl.py +43 -0
  234. khoj/utils/models.py +47 -0
  235. khoj/utils/rawconfig.py +160 -0
  236. khoj/utils/state.py +46 -0
  237. khoj/utils/yaml.py +43 -0
  238. khoj-1.16.1.dev15.dist-info/METADATA +178 -0
  239. khoj-1.16.1.dev15.dist-info/RECORD +242 -0
  240. khoj-1.16.1.dev15.dist-info/WHEEL +4 -0
  241. khoj-1.16.1.dev15.dist-info/entry_points.txt +2 -0
  242. khoj-1.16.1.dev15.dist-info/licenses/LICENSE +661 -0
@@ -0,0 +1,217 @@
1
+ import calendar
2
+ import logging
3
+ import re
4
+ from collections import defaultdict
5
+ from datetime import datetime, timedelta
6
+ from math import inf
7
+ from typing import List, Tuple
8
+
9
+ import dateparser as dtparse
10
+ from dateparser.search import search_dates
11
+ from dateparser_data.settings import default_parsers
12
+ from dateutil.relativedelta import relativedelta
13
+
14
+ from khoj.search_filter.base_filter import BaseFilter
15
+ from khoj.utils.helpers import LRU, merge_dicts, timer
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ class DateFilter(BaseFilter):
21
+ # Date Range Filter Regexes
22
+ # Example filter queries:
23
+ # - dt>="yesterday" dt<"tomorrow"
24
+ # - dt>="last week"
25
+ # - dt:"2 years ago"
26
+ date_regex = r"dt([:><=]{1,2})[\"'](.*?)[\"']"
27
+
28
def __init__(self, entry_key="compiled"):
    """Set up the state used to find and parse dates.

    Args:
        entry_key: Name of the entry field this filter is associated with.
    """
    self.entry_key = entry_key
    self.date_to_entry_ids = defaultdict(set)
    self.cache = LRU()
    self.dtparser_regexes = self.compile_date_regexes()
    # Strip an ordinal suffix only when it directly follows a digit and ends the token.
    # An unanchored (st|nd|rd|th) also removes the "st" inside "August", which turns
    # "1st August 1984" into "1 Augu 1984" and makes strptime reject the date.
    # Case is ignored because the date regexes themselves match case-insensitively.
    self.dtparser_ordinal_suffixes = re.compile(r"(?<=\d)(st|nd|rd|th)\b", re.IGNORECASE)
    self.dtparser_settings = {
        "PREFER_DAY_OF_MONTH": "first",
        "DATE_ORDER": "YMD",  # Prefer YMD and DMY over MDY when parsing ambiguous dates
    }
38
+
39
def compile_date_regexes(self):
    """Build the (strptime format, compiled regex) pairs used to find dates in text.

    Returns:
        A list of ``(date_format, pattern)`` tuples: structured dates first,
        then full natural dates, then month-and-year dates. Every pattern is
        compiled with ``re.IGNORECASE``.
    """
    ignore_case = re.IGNORECASE
    # Day of month with an optional ordinal suffix, e.g. 1, 1st, 22nd
    ordinal_day = r"\d{1,2}(?:st|nd|rd|th)?"
    # strptime directive -> regex alternation of the matching month names
    month_alternatives = (
        ("B", "|".join(calendar.month_name[1:])),
        ("b", "|".join(calendar.month_abbr[1:])),
    )
    # strptime directive -> regex for four and two digit years
    year_widths = (("Y", r"\d{4}"), ("y", r"\d{2}"))

    # Structured dates like 1984-04-01, 1984/04/01, 01-04-1984, 01/04/1984, 01.04.1984, 01-04-84, 01/04/84
    year_first = re.compile(r"\b\d{4}[-\/]\d{2}[-\/]\d{2}\b", ignore_case)
    day_first_long_year = re.compile(r"\b\d{2}[-\/]\d{2}[-\/]\d{4}\b", ignore_case)
    day_first_short_year = re.compile(r"\b\d{2}[-\/]\d{2}[-\/]\d{2}\b", ignore_case)
    day_first_dotted = re.compile(r"\b\d{2}[\.]\d{2}[\.]\d{4}\b", ignore_case)

    date_patterns: List[Tuple[str, re.Pattern[str]]] = [
        ("%Y-%m-%d", year_first),
        ("%Y/%m/%d", year_first),
        ("%d-%m-%Y", day_first_long_year),
        ("%d/%m/%Y", day_first_long_year),
        ("%d.%m.%Y", day_first_dotted),
        ("%d-%m-%y", day_first_short_year),
        ("%d/%m/%y", day_first_short_year),
    ]

    # Natural dates like 1st April 1984, 31 April 84, Apr 4th 1984, 13 Apr 84
    for month_code, names in month_alternatives:
        for year_code, digits in year_widths:
            date_patterns.append(
                (
                    f"%d %{month_code} %{year_code}",
                    re.compile(rf"\b{ordinal_day} (?:{names}) {digits}\b", ignore_case),
                )
            )
        for year_code, digits in year_widths:
            date_patterns.append(
                (
                    f"%{month_code} %d %{year_code}",
                    re.compile(rf"\b(?:{names}) {ordinal_day} {digits}\b", ignore_case),
                )
            )

    # Partial natural dates of form Month, Year like January 2021, Jan 2021, Jan 21
    for month_code, names in month_alternatives:
        for year_code, digits in year_widths:
            date_patterns.append(
                (
                    f"%{month_code} %{year_code}",
                    re.compile(rf"\b(?:{names}) {digits}\b", ignore_case),
                )
            )

    return date_patterns
88
+
89
def extract_dates(self, content):
    "Find natural and structured dates in content and return them as unique datetimes"
    found = set()
    for fmt, pattern in self.dtparser_regexes:
        for candidate in pattern.findall(content):
            # strptime cannot parse ordinal suffixes, so drop them first
            cleaned = self.dtparser_ordinal_suffixes.sub("", candidate)
            try:
                parsed = datetime.strptime(cleaned, fmt)
            except ValueError:
                # The text matched the regex but is not a valid date in this format
                continue
            found.add(parsed)

    return list(found)
103
+
104
def get_filter_terms(self, query: str) -> List[str]:
    "Return every date filter in query, normalized to the form dt<comparator>'<date>'"
    terms = []
    for comparator, date_text in re.findall(self.date_regex, query):
        terms.append(f"dt{comparator}'{date_text}'")
    return terms
107
+
108
def get_query_date_range(self, query) -> List:
    "Return the date range requested by the filters in query, timing the extraction"
    with timer("Extract date range to filter from query", logger):
        return self.extract_date_range(query)
113
+
114
def defilter(self, query):
    """Remove date range filters from the query.

    Args:
        query: Query string that may contain filters like dt>="yesterday".

    Returns:
        The query with date filters removed, runs of whitespace collapsed to a
        single space, and surrounding whitespace stripped.
    """
    # Accept a filter at the very start of the query as well as one preceded by
    # whitespace. Requiring leading whitespace alone leaves a query-initial filter
    # in the text even though it is still detected and applied as a date filter.
    query = re.sub(rf"(?:^|\s+){self.date_regex}", " ", query)
    query = re.sub(r"\s{2,}", " ", query).strip()  # remove multiple spaces
    return query
119
+
120
def extract_date_range(self, query):
    """Combine all date filters in the query into one effective date range.

    Each filter's date is parsed into a (start, end) range at its natural
    granularity, turned into an interval according to its comparator, and the
    intervals are intersected. For example
        >=yesterday maps to [start_of_yesterday, inf)
        <tomorrow   maps to [0, start_of_tomorrow)
    and together they give [start_of_yesterday, start_of_tomorrow).

    Args:
        query: Query string that may contain filters like dt>="yesterday".

    Returns:
        ``[]`` when the query has no usable date filter or the filters do not
        overlap. Otherwise ``[start, end]`` as timestamps, where an unbounded
        side is ``None``.
    """
    # find date range filter in query
    date_range_matches = re.findall(self.date_regex, query)
    if not date_range_matches:
        return []

    intervals = []
    for cmp, date_str in date_range_matches:
        # Parse each filter exactly once. Parsing twice doubles the work and, for
        # relative dates, could resolve the two calls against different "now" values.
        parsed_range = self.parse(date_str)
        if not parsed_range:
            continue
        dt_start, dt_end = parsed_range
        dtrange_start, dtrange_end = dt_start.timestamp(), dt_end.timestamp()

        # Combine the date with its comparator to form a date range interval
        if cmp == ">":
            intervals.append((dtrange_end, inf))
        elif cmp == ">=":
            intervals.append((dtrange_start, inf))
        elif cmp == "<":
            intervals.append((0, dtrange_start))
        elif cmp == "<=":
            intervals.append((0, dtrange_end))
        elif cmp in ("=", ":", "=="):
            intervals.append((dtrange_start, dtrange_end))
        # Any other comparator combination is ignored

    # Intersect the intervals to get the effective date range to filter entries by
    effective_start, effective_end = 0, inf
    for interval_start, interval_end in intervals:
        effective_start = max(effective_start, interval_start)
        effective_end = min(effective_end, interval_end)

    # Nothing constrained the range, or the constraints contradict each other
    if (effective_start == 0 and effective_end == inf) or effective_start > effective_end:
        return []

    # Report an unbounded side as None
    return [
        None if effective_start == 0 else effective_start,
        None if effective_end == inf else effective_end,
    ]
177
+
178
def parse(self, date_str, relative_base=None):
    """Parse the date string from a query date filter into a date range.

    Args:
        date_str: Date text taken from the filter, e.g. "yesterday" or "2 years ago".
        relative_base: Datetime that relative dates are resolved against.
            Defaults to the current time.

    Returns:
        The range produced by ``date_to_daterange`` for the parsed date, or
        ``None`` when the date parser raises or returns no date.
    """
    # clean date string to handle future date parsing by date parser
    future_strings = ["later", "from now", "from today"]
    refers_to_future = any(phrase in date_str for phrase in future_strings)
    dtquery_settings = {
        "RELATIVE_BASE": relative_base or datetime.now(),
        "PREFER_DATES_FROM": "future" if refers_to_future else "past",
    }
    dtparser_settings = merge_dicts(dtquery_settings, self.dtparser_settings)

    # parse date passed in query date filter
    clean_date_str = re.sub("|".join(future_strings), "", date_str)
    try:
        parsed_date = dtparse.parse(clean_date_str, settings=dtparser_settings)
    except Exception as e:
        logger.error(f"Failed to parse date string: {date_str} with error: {e}")
        return None

    if parsed_date is None:
        return None
    return self.date_to_daterange(parsed_date, date_str)
198
+
199
def date_to_daterange(self, parsed_date, date_str):
    """Convert a parsed date to a (start, end) range at its natural granularity.

    The granularity is picked from the words in date_str: "year", then
    "month", then "week"; anything else is treated as a single day.
    """
    midnight = parsed_date.replace(hour=0, minute=0, second=0, microsecond=0)

    if "year" in date_str:
        first_of_year = datetime(parsed_date.year, 1, 1, 0, 0, 0)
        first_of_next_year = datetime(parsed_date.year + 1, 1, 1, 0, 0, 0)
        return (first_of_year, first_of_next_year)

    if "month" in date_str:
        first_of_month = datetime(parsed_date.year, parsed_date.month, 1, 0, 0, 0)
        return (first_of_month, first_of_month + relativedelta(months=1))

    if "week" in date_str:
        # if week in date string, dateparser parses it to next week start
        # so the parsed day is the end of the week being asked for
        return (midnight - timedelta(days=7), midnight)

    return (midnight, midnight + relativedelta(days=1))
@@ -0,0 +1,30 @@
1
+ import fnmatch
2
+ import logging
3
+ import re
4
+ from collections import defaultdict
5
+ from typing import List
6
+
7
+ from khoj.search_filter.base_filter import BaseFilter
8
+ from khoj.utils.helpers import LRU, timer
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class FileFilter(BaseFilter):
    """Filter for query terms of the form file:"<name or glob>"."""

    file_filter_regex = r'file:"(.+?)" ?'

    def __init__(self, entry_key="file"):
        self.entry_key = entry_key
        self.file_to_entry_map = defaultdict(set)
        self.cache = LRU()

    def get_filter_terms(self, query: str) -> List[str]:
        "Return each file filter in query, converted to a regex"
        terms = []
        for file_pattern in re.findall(self.file_filter_regex, query):
            terms.append(f"{self.convert_to_regex(file_pattern)}")
        return terms

    def convert_to_regex(self, file_filter: str) -> str:
        "Convert a file filter to a regex: dots become literal, * matches anything"
        # Dots must be escaped before * is expanded to .* so the new dots stay wildcards
        literal_dots = file_filter.replace(".", r"\.")
        return literal_dots.replace("*", r".*")

    def defilter(self, query: str) -> str:
        "Remove all file filters from query"
        without_filters = re.sub(self.file_filter_regex, "", query)
        return without_filters.strip()
@@ -0,0 +1,29 @@
1
+ import logging
2
+ import re
3
+ from collections import defaultdict
4
+ from typing import List
5
+
6
+ from khoj.search_filter.base_filter import BaseFilter
7
+ from khoj.utils.helpers import LRU
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class WordFilter(BaseFilter):
    """Filter for required (+"word") and blocked (-"word") terms in a query."""

    # Filter Regex
    required_regex = r'\+"([a-zA-Z0-9_-]+)" ?'
    blocked_regex = r'\-"([a-zA-Z0-9_-]+)" ?'

    def __init__(self, entry_key="raw"):
        self.entry_key = entry_key
        self.word_to_entry_index = defaultdict(set)
        self.cache = LRU()

    def get_filter_terms(self, query: str) -> List[str]:
        "Return all word filters in query: required terms as +word, then blocked terms as -word"
        terms = []
        for must_have in re.findall(self.required_regex, query):
            terms.append(f"+{must_have}")
        for must_not_have in re.findall(self.blocked_regex, query):
            terms.append(f"-{must_not_have}")
        return terms

    def defilter(self, query: str) -> str:
        "Remove all required and blocked word filters from query"
        without_required = re.sub(self.required_regex, "", query)
        without_blocked = re.sub(self.blocked_regex, "", without_required)
        return without_blocked.strip()
File without changes
@@ -0,0 +1,241 @@
1
+ import logging
2
+ import math
3
+ from pathlib import Path
4
+ from typing import List, Tuple, Type, Union
5
+
6
+ import torch
7
+ from asgiref.sync import sync_to_async
8
+ from sentence_transformers import util
9
+
10
+ from khoj.database.adapters import EntryAdapters, get_user_search_model_or_default
11
+ from khoj.database.models import Entry as DbEntry
12
+ from khoj.database.models import KhojUser
13
+ from khoj.processor.content.text_to_entries import TextToEntries
14
+ from khoj.utils import state
15
+ from khoj.utils.helpers import get_absolute_path, timer
16
+ from khoj.utils.jsonl import load_jsonl
17
+ from khoj.utils.models import BaseEncoder
18
+ from khoj.utils.rawconfig import Entry, SearchResponse
19
+ from khoj.utils.state import SearchType
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
# Maps each search type value to the database entry type passed as the file type
# filter when searching. Searching across all content maps to None.
search_type_to_embeddings_type = {
    SearchType.Org.value: DbEntry.EntryType.ORG,
    SearchType.Markdown.value: DbEntry.EntryType.MARKDOWN,
    SearchType.Plaintext.value: DbEntry.EntryType.PLAINTEXT,
    SearchType.Pdf.value: DbEntry.EntryType.PDF,
    SearchType.Github.value: DbEntry.EntryType.GITHUB,
    SearchType.Notion.value: DbEntry.EntryType.NOTION,
    SearchType.All.value: None,
}
32
+
33
+
34
def extract_entries(jsonl_file) -> List[Entry]:
    "Load the jsonl file and build an Entry from each record in it"
    return [Entry.from_dict(record) for record in load_jsonl(jsonl_file)]
37
+
38
+
39
def compute_embeddings(
    entries_with_ids: List[Tuple[int, Entry]],
    bi_encoder: BaseEncoder,
    embeddings_file: Path,
    regenerate=False,
    normalize=True,
):
    """Compute embeddings for new entries, merge with reused ones and save the result.

    Args:
        entries_with_ids: ``(id, entry)`` pairs. An id of -1 marks a new entry to
            encode; any other id is used as a row index into the embeddings
            previously saved in embeddings_file.
        bi_encoder: Encoder used to embed the compiled text of new entries.
        embeddings_file: File the embeddings are loaded from and saved to.
        regenerate: When true, ignore any previously saved embeddings.
        normalize: When true, normalize the merged embeddings before saving.

    Returns:
        The merged embeddings tensor, reused rows first and new rows after.
    """
    device = state.device
    fresh_embeddings = torch.tensor([], device=device)
    reused_embeddings = torch.tensor([], device=device)
    index_note = ""

    # Start from the saved embeddings unless there are none or a rebuild was requested
    reuse_saved = embeddings_file.exists() and not regenerate
    if reuse_saved:
        corpus_embeddings: torch.Tensor = torch.load(get_absolute_path(embeddings_file), map_location=device)
        logger.debug(f"Loaded {len(corpus_embeddings)} text embeddings from {embeddings_file}")
    else:
        corpus_embeddings = torch.tensor([], device=device)
        index_note = " Creating index from scratch."

    # Encode the compiled text of entries that have no saved embedding yet
    texts_to_encode = [entry.compiled for entry_id, entry in entries_with_ids if entry_id == -1]
    if texts_to_encode:
        logger.info(f"📩 Indexing {len(texts_to_encode)} text entries.{index_note}")
        fresh_embeddings = bi_encoder.encode(
            texts_to_encode, convert_to_tensor=True, device=device, show_progress_bar=True
        )

    # Pick the rows of the saved embeddings that belong to entries still present
    reused_rows = [entry_id for entry_id, _ in entries_with_ids if entry_id != -1]
    if reused_rows:
        row_index = torch.tensor(reused_rows, device=device)
        reused_embeddings = torch.index_select(corpus_embeddings, 0, row_index)

    # Corpus embeddings are the reused rows followed by the newly encoded ones
    corpus_embeddings = torch.cat([reused_embeddings, fresh_embeddings], dim=0)
    if normalize:
        # Normalize embeddings for faster lookup via dot product when querying
        corpus_embeddings = util.normalize_embeddings(corpus_embeddings)

    # Save regenerated or updated embeddings to file
    torch.save(corpus_embeddings, embeddings_file)
    logger.info(f"📩 Saved computed text embeddings to {embeddings_file}")

    return corpus_embeddings
84
+
85
+
86
def load_embeddings(
    embeddings_file: Path,
):
    "Load and normalize the embeddings saved in embeddings_file, or return None if the file does not exist"
    if not embeddings_file.exists():
        return None

    saved_path = get_absolute_path(embeddings_file)
    corpus_embeddings: torch.Tensor = torch.load(saved_path, map_location=state.device)
    logger.debug(f"Loaded {len(corpus_embeddings)} text embeddings from {embeddings_file}")
    return util.normalize_embeddings(corpus_embeddings)
96
+
97
+
98
async def query(
    user: KhojUser,
    raw_query: str,
    type: SearchType = SearchType.All,
    question_embedding: Union[torch.Tensor, None] = None,
    max_distance: float = math.inf,
) -> Tuple[List[dict], List[Entry]]:
    """Search the user's entries for those that answer the query.

    Args:
        user: User whose entries are searched.
        raw_query: Query text; also forwarded unchanged to the entry search.
        type: Search type, mapped to the entry type used as file type filter.
        question_embedding: Precomputed query embedding. When None, the query is
            embedded with the user's search model.
        max_distance: Maximum distance forwarded to the entry search.

    Returns:
        The list of hits returned by the entry search (at most 10 are requested).
        NOTE(review): the declared return type is a tuple but a single list is
        returned; confirm which one callers expect.
    """
    entry_type_filter = search_type_to_embeddings_type[type.value]

    # Encode the query using the bi-encoder
    if question_embedding is None:
        with timer("Query Encode Time", logger, state.device):
            search_model = await sync_to_async(get_user_search_model_or_default)(user)
            encoder = state.embeddings_model[search_model.name]
            question_embedding = encoder.embed_query(raw_query)

    # Find relevant entries for the query
    max_results = 10
    with timer("Search Time", logger, state.device):
        matches = EntryAdapters.search_with_embeddings(
            user=user,
            embeddings=question_embedding,
            max_results=max_results,
            file_type_filter=entry_type_filter,
            raw_query=raw_query,
            max_distance=max_distance,
        ).all()
        hits = await sync_to_async(list)(matches)  # type: ignore[call-arg]

    return hits
131
+
132
+
133
def collate_results(hits, dedupe=True):
    """Yield a SearchResponse for each hit, optionally skipping duplicates.

    With dedupe enabled, a hit is skipped when its hashed value or its corpus id
    was already seen on an earlier hit.
    """
    seen_ids = set()
    seen_hashes = set()
    for hit in hits:
        if dedupe and (hit.hashed_value in seen_hashes or hit.corpus_id in seen_ids):
            continue

        seen_hashes.add(hit.hashed_value)
        seen_ids.add(hit.corpus_id)
        additional = {
            "source": hit.file_source,
            "file": hit.file_path,
            "compiled": hit.compiled,
            "heading": hit.heading,
        }
        yield SearchResponse.model_validate(
            {
                "entry": hit.raw,
                "score": hit.distance,
                "corpus_id": str(hit.corpus_id),
                "additional": additional,
            }
        )
156
+
157
+
158
def deduplicated_search_responses(hits: List[SearchResponse]):
    "Yield a fresh SearchResponse for the first hit seen with each corpus id"
    seen_ids = set()
    for hit in hits:
        if hit.corpus_id in seen_ids:
            continue

        seen_ids.add(hit.corpus_id)
        additional = {
            "source": hit.additional["source"],
            "file": hit.additional["file"],
            "compiled": hit.additional["compiled"],
            "heading": hit.additional["heading"],
        }
        yield SearchResponse.model_validate(
            {
                "entry": hit.entry,
                "score": hit.score,
                "corpus_id": hit.corpus_id,
                "additional": additional,
            }
        )
179
+
180
+
181
def rerank_and_sort_results(hits, query, rank_results, search_model_name):
    """Optionally rerank hits with the cross-encoder, then sort them.

    Args:
        hits: Search results to order. May be a list or any other iterable.
        query: Query the hits are scored against when reranking.
        rank_results: Whether reranking was explicitly requested.
        search_model_name: Key of the cross-encoder model in state.

    Returns:
        The sorted hits.
    """
    # Materialize non-list iterables once. Counting via list(hits) would exhaust a
    # generator and leave nothing to score or sort. Lists are kept as-is so they
    # are still sorted in place, as before.
    if not isinstance(hits, list):
        hits = list(hits)

    # Rerank results if explicitly requested, if can use inference server
    # AND if we have more than one result
    rank_results = (
        rank_results or state.cross_encoder_model[search_model_name].inference_server_enabled()
    ) and len(hits) > 1

    # Score all retrieved entries using the cross-encoder
    if rank_results:
        hits = cross_encoder_score(query, hits, search_model_name)

    # Sort results by cross-encoder score followed by bi-encoder score
    hits = sort_results(rank_results=rank_results, hits=hits)

    return hits
196
+
197
+
198
def setup(
    text_to_entries: Type[TextToEntries],
    files: dict[str, str],
    regenerate: bool,
    full_corpus: bool = True,
    user: KhojUser = None,
    config=None,
) -> None:
    """Process the given files into entries and log what changed.

    Args:
        text_to_entries: Processor class. Instantiated with config when one is
            given, otherwise without arguments.
        files: Files to process, keyed by file name.
        regenerate: Forwarded to the processor.
        full_corpus: Forwarded to the processor.
        user: User the entries belong to; forwarded to the processor.
        config: Optional configuration for the processor.
    """
    processor = text_to_entries(config) if config else text_to_entries()
    num_new_embeddings, num_deleted_embeddings = processor.process(
        files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
    )

    if not files:
        return

    file_names = list(files)
    logger.info(
        f"Deleted {num_deleted_embeddings} entries. Created {num_new_embeddings} new entries for user {user} from files {file_names[:10]} ..."
    )
221
+
222
+
223
def cross_encoder_score(query: str, hits: List[SearchResponse], search_model_name: str) -> List[SearchResponse]:
    """Score all retrieved entries using the cross-encoder.

    Stores ``1 - score`` under "cross_score" on each hit, so that a lower value
    means a better match, and returns the same hits.
    """
    cross_encoder = state.cross_encoder_model[search_model_name]
    with timer("Cross-Encoder Predict Time", logger, state.device):
        cross_scores = cross_encoder.predict(query, hits)

    # Convert cross-encoder scores to distances and pass in hits for reranking
    for position, cross_score in enumerate(cross_scores):
        hits[position]["cross_score"] = 1 - cross_score

    return hits
233
+
234
+
235
def sort_results(rank_results: bool, hits: List[dict]) -> List[dict]:
    """Order results by cross-encoder score followed by bi-encoder score.

    The hits are sorted in place and returned. Because sorting is stable, the
    bi-encoder order is kept among hits with equal cross-encoder score.
    """

    def bi_encoder_score(hit):
        return hit["score"]

    def cross_encoder_distance(hit):
        return hit["cross_score"]

    with timer("Rank Time", logger, state.device):
        hits.sort(key=bi_encoder_score)
        if rank_results:
            hits.sort(key=cross_encoder_distance)
    return hits
khoj/utils/__init__.py ADDED
File without changes
khoj/utils/cli.py ADDED
@@ -0,0 +1,93 @@
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import pathlib
5
+ from importlib.metadata import version
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ from khoj.migrations.migrate_offline_chat_default_model import (
10
+ migrate_offline_chat_default_model,
11
+ )
12
+ from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema
13
+ from khoj.migrations.migrate_offline_model import migrate_offline_model
14
+ from khoj.migrations.migrate_processor_config_openai import (
15
+ migrate_processor_conversation_schema,
16
+ )
17
+ from khoj.migrations.migrate_server_pg import migrate_server_pg
18
+ from khoj.migrations.migrate_version import migrate_config_to_version
19
+ from khoj.utils.helpers import in_debug_mode, resolve_absolute_path
20
+ from khoj.utils.yaml import parse_config_from_file
21
+
22
+
23
def _build_cli_parser():
    """Create the argument parser describing every Khoj commandline option."""
    parser = argparse.ArgumentParser(description="Start Khoj; An AI personal assistant for your Digital Brain")
    parser.add_argument(
        "--config-file", default="~/.khoj/khoj.yml", type=pathlib.Path, help="YAML file to configure Khoj"
    )
    parser.add_argument(
        "--regenerate",
        action="store_true",
        default=False,
        help="Regenerate model embeddings from source files. Default: false",
    )
    parser.add_argument("--verbose", "-v", action="count", default=0, help="Show verbose conversion logs. Default: 0")
    parser.add_argument("--host", type=str, default="127.0.0.1", help="Host address of the server. Default: 127.0.0.1")
    parser.add_argument("--port", "-p", type=int, default=42110, help="Port of the server. Default: 42110")
    parser.add_argument(
        "--socket",
        type=pathlib.Path,
        help="Path to UNIX socket for server. Use to run server behind reverse proxy. Default: /tmp/uvicorn.sock",
    )
    parser.add_argument("--version", "-V", action="store_true", help="Print the installed Khoj version and exit")
    parser.add_argument(
        "--disable-chat-on-gpu", action="store_true", default=False, help="Disable using GPU for the offline chat model"
    )
    parser.add_argument(
        "--anonymous-mode",
        action="store_true",
        default=False,
        help="Run Khoj in anonymous mode. This does not require any login for connecting users.",
    )
    return parser


def cli(args=None):
    """Parse the commandline arguments and load the Khoj configuration.

    Args:
        args: Argument strings to parse. ``None`` lets argparse read them from
            the process commandline.

    Returns:
        The parsed namespace, extended with:
        ``chat_on_gpu`` (inverse of ``--disable-chat-on-gpu``),
        ``version_no`` (installed ``khoj`` package version),
        ``config_file`` (passed through ``resolve_absolute_path``) and
        ``config`` (parsed config file, or ``None`` when the file does not exist).

    Raises:
        SystemExit: With status 0 after printing the version when ``--version``
            is passed. argparse may also exit on ``--help`` or invalid values.
    """
    parser = _build_cli_parser()

    # Unknown arguments are tolerated and only reported, not treated as errors
    args, remaining_args = parser.parse_known_args(args)

    if remaining_args:
        logger.info(f"⚠️ Ignoring unknown commandline args: {remaining_args}")

    # Set default values for arguments
    args.chat_on_gpu = not args.disable_chat_on_gpu

    args.version_no = version("khoj")
    if args.version:
        # Show version of khoj installed and exit.
        # Raise SystemExit directly: the `exit()` helper is injected by the `site`
        # module for interactive use and is not guaranteed to exist in programs.
        print(args.version_no)
        raise SystemExit(0)

    # Normalize config_file path to absolute path
    args.config_file = resolve_absolute_path(args.config_file)

    if not args.config_file.exists():
        args.config = None
    else:
        # Migrations run before the config file is parsed
        args = run_migrations(args)
        args.config = parse_config_from_file(args.config_file)
        if in_debug_mode():
            args.config.app.should_log_telemetry = False

    return args
80
+
81
+
82
def run_migrations(args):
    """Pass ``args`` through each config migration in turn and return the result.

    Every migration receives the value returned by the previous one, so the
    migrations run strictly in the order listed below.
    """
    for apply_migration in (
        migrate_config_to_version,
        migrate_processor_conversation_schema,
        migrate_offline_model,
        migrate_offline_chat_schema,
        migrate_offline_chat_default_model,
        migrate_server_pg,
    ):
        args = apply_migration(args)
    return args