khoj 1.17.0__py3-none-any.whl → 1.17.1.dev217__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. khoj/configure.py +6 -6
  2. khoj/database/adapters/__init__.py +47 -2
  3. khoj/database/migrations/0053_agent_style_color_agent_style_icon.py +61 -0
  4. khoj/database/models/__init__.py +35 -0
  5. khoj/interface/web/assets/icons/favicon-128x128.png +0 -0
  6. khoj/interface/web/assets/icons/favicon-256x256.png +0 -0
  7. khoj/interface/web/assets/icons/khoj-logo-sideways-200.png +0 -0
  8. khoj/interface/web/assets/icons/khoj-logo-sideways-500.png +0 -0
  9. khoj/interface/web/assets/icons/khoj-logo-sideways.svg +31 -5384
  10. khoj/interface/web/assets/icons/khoj.svg +26 -0
  11. khoj/interface/web/chat.html +5 -5
  12. khoj/interface/web/content_source_computer_input.html +3 -3
  13. khoj/interface/web/content_source_github_input.html +1 -1
  14. khoj/interface/web/content_source_notion_input.html +1 -1
  15. khoj/interface/web/public_conversation.html +1 -1
  16. khoj/interface/web/search.html +2 -2
  17. khoj/interface/web/{config.html → settings.html} +30 -30
  18. khoj/interface/web/utils.html +1 -1
  19. khoj/processor/content/docx/docx_to_entries.py +4 -9
  20. khoj/processor/content/github/github_to_entries.py +1 -3
  21. khoj/processor/content/images/image_to_entries.py +4 -9
  22. khoj/processor/content/markdown/markdown_to_entries.py +4 -9
  23. khoj/processor/content/notion/notion_to_entries.py +1 -3
  24. khoj/processor/content/org_mode/org_to_entries.py +4 -9
  25. khoj/processor/content/pdf/pdf_to_entries.py +4 -9
  26. khoj/processor/content/plaintext/plaintext_to_entries.py +4 -9
  27. khoj/processor/content/text_to_entries.py +1 -3
  28. khoj/processor/tools/online_search.py +4 -4
  29. khoj/routers/api.py +49 -4
  30. khoj/routers/api_agents.py +3 -1
  31. khoj/routers/api_chat.py +80 -88
  32. khoj/routers/api_content.py +538 -0
  33. khoj/routers/api_model.py +156 -0
  34. khoj/routers/helpers.py +308 -7
  35. khoj/routers/notion.py +2 -8
  36. khoj/routers/web_client.py +43 -256
  37. khoj/search_type/text_search.py +5 -4
  38. khoj/utils/fs_syncer.py +3 -1
  39. khoj/utils/rawconfig.py +6 -1
  40. {khoj-1.17.0.dist-info → khoj-1.17.1.dev217.dist-info}/METADATA +2 -2
  41. {khoj-1.17.0.dist-info → khoj-1.17.1.dev217.dist-info}/RECORD +44 -42
  42. khoj/routers/api_config.py +0 -434
  43. khoj/routers/indexer.py +0 -349
  44. {khoj-1.17.0.dist-info → khoj-1.17.1.dev217.dist-info}/WHEEL +0 -0
  45. {khoj-1.17.0.dist-info → khoj-1.17.1.dev217.dist-info}/entry_points.txt +0 -0
  46. {khoj-1.17.0.dist-info → khoj-1.17.1.dev217.dist-info}/licenses/LICENSE +0 -0
khoj/routers/indexer.py DELETED
@@ -1,349 +0,0 @@
1
- import asyncio
2
- import logging
3
- from typing import Dict, Optional, Union
4
-
5
- from fastapi import APIRouter, Depends, Header, Request, Response, UploadFile
6
- from pydantic import BaseModel
7
- from starlette.authentication import requires
8
-
9
- from khoj.database.models import GithubConfig, KhojUser, NotionConfig
10
- from khoj.processor.content.docx.docx_to_entries import DocxToEntries
11
- from khoj.processor.content.github.github_to_entries import GithubToEntries
12
- from khoj.processor.content.images.image_to_entries import ImageToEntries
13
- from khoj.processor.content.markdown.markdown_to_entries import MarkdownToEntries
14
- from khoj.processor.content.notion.notion_to_entries import NotionToEntries
15
- from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
16
- from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
17
- from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
18
- from khoj.routers.helpers import ApiIndexedDataLimiter, update_telemetry_state
19
- from khoj.search_type import text_search
20
- from khoj.utils import constants, state
21
- from khoj.utils.config import SearchModels
22
- from khoj.utils.helpers import LRU, get_file_type
23
- from khoj.utils.rawconfig import ContentConfig, FullConfig, SearchConfig
24
- from khoj.utils.yaml import save_config_to_file_updated_state
25
-
26
- logger = logging.getLogger(__name__)
27
-
28
- indexer = APIRouter()
29
-
30
-
31
class File(BaseModel):
    """A single file described by its path and its text or binary content."""

    # Location or name identifying the file.
    path: str
    # File payload; either decoded text or raw bytes.
    content: Union[str, bytes]
34
-
35
-
36
class IndexBatchRequest(BaseModel):
    """Request body carrying a batch of ``File`` objects."""

    # Files included in this batch.
    files: list[File]
38
-
39
-
40
class IndexerInput(BaseModel):
    """Files to index, grouped by content type.

    Each field maps a file name to that file's content. Text formats hold
    ``str`` content, binary formats hold ``bytes``. Every field defaults to
    ``None`` when no files of that type are supplied.
    """

    # Text based content types.
    org: Optional[dict[str, str]] = None
    markdown: Optional[dict[str, str]] = None
    # Binary content type.
    pdf: Optional[dict[str, bytes]] = None
    # Text based content type.
    plaintext: Optional[dict[str, str]] = None
    # Binary content types.
    image: Optional[dict[str, bytes]] = None
    docx: Optional[dict[str, bytes]] = None
47
-
48
-
49
@indexer.post("/update")
@requires(["authenticated"])
async def update(
    request: Request,
    files: list[UploadFile],
    force: bool = False,
    t: Optional[Union[state.SearchType, str]] = state.SearchType.All,
    client: Optional[str] = None,
    user_agent: Optional[str] = Header(None),
    referer: Optional[str] = Header(None),
    host: Optional[str] = Header(None),
    indexed_data_limiter: ApiIndexedDataLimiter = Depends(
        ApiIndexedDataLimiter(
            incoming_entries_size_limit=10,
            subscribed_incoming_entries_size_limit=75,
            total_entries_size_limit=10,
            subscribed_total_entries_size_limit=100,
        )
    ),
):
    """Index the uploaded files for the authenticated user.

    Uploaded files are grouped by detected file type, unsupported types are
    skipped with a warning, and the grouped content is handed to
    ``configure_content`` on the default executor so the event loop is not
    blocked while indexing runs.

    Args:
        request: Incoming request; ``request.user.object`` is the user to index for.
        files: Uploaded files to index.
        force: Passed to ``configure_content`` as its ``regenerate`` argument.
        t: Search type to update; defaults to all types.
        client: Name of the calling client, used in logs and telemetry.
        user_agent: ``User-Agent`` header, forwarded to telemetry.
        referer: ``Referer`` header, forwarded to telemetry.
        host: ``Host`` header, forwarded to telemetry.
        indexed_data_limiter: Dependency limiting how much data may be indexed.

    Returns:
        A 200 ``Response`` whose body is the comma separated names of the
        indexed files, or a 500 ``Response`` with body ``"Failed"`` when
        indexing raised or reported failure.
    """
    user = request.user.object
    # Values are decoded text for text formats and raw bytes for binary formats.
    index_files: Dict[str, Dict[str, Union[str, bytes]]] = {
        "org": {},
        "markdown": {},
        "pdf": {},
        "plaintext": {},
        "image": {},
        "docx": {},
    }
    try:
        logger.info(f"📬 Updating content index via API call by {client} client")
        for file in files:
            file_content = file.file.read()
            file_type, encoding = get_file_type(file.content_type, file_content)
            if file_type in index_files:
                # Only decode when an encoding was detected; otherwise keep the raw bytes.
                index_files[file_type][file.filename] = file_content.decode(encoding) if encoding else file_content
            else:
                logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")

        indexer_input = IndexerInput(
            org=index_files["org"],
            markdown=index_files["markdown"],
            pdf=index_files["pdf"],
            plaintext=index_files["plaintext"],
            image=index_files["image"],
            docx=index_files["docx"],
        )

        if state.config is None:
            logger.info("📬 Initializing content index on first run.")
            default_full_config = FullConfig(
                content_type=None,
                search_type=SearchConfig.model_validate(constants.default_config["search-type"]),
                processor=None,
            )
            state.config = default_full_config
            default_content_config = ContentConfig(
                org=None,
                markdown=None,
                pdf=None,
                docx=None,
                image=None,
                github=None,
                notion=None,
                plaintext=None,
            )
            state.config.content_type = default_content_config
            save_config_to_file_updated_state()
            configure_search(state.search_models, state.config.search_type)

        # Indexing is synchronous work; run it on the default executor.
        # We are inside a coroutine, so the running loop is the right handle.
        loop = asyncio.get_running_loop()
        success = await loop.run_in_executor(
            None,
            configure_content,
            indexer_input.model_dump(),
            force,
            t,
            False,
            user,
        )
        if not success:
            raise RuntimeError("Failed to update content index")
        logger.info("Finished processing batch indexing request")
    except Exception as e:
        # Log the failure once, with the traceback, using the more detailed message.
        logger.error(
            f'🚨 Failed to {"force " if force else ""}update {t} content index triggered via API call by {client} client: {e}',
            exc_info=True,
        )
        return Response(content="Failed", status_code=500)

    indexing_metadata = {
        "num_org": len(index_files["org"]),
        "num_markdown": len(index_files["markdown"]),
        "num_pdf": len(index_files["pdf"]),
        "num_plaintext": len(index_files["plaintext"]),
        "num_image": len(index_files["image"]),
        "num_docx": len(index_files["docx"]),
    }

    update_telemetry_state(
        request=request,
        telemetry_type="api",
        api="index/update",
        client=client,
        user_agent=user_agent,
        referer=referer,
        host=host,
        metadata=indexing_metadata,
    )

    logger.info(f"📪 Content index updated via API call by {client} client")

    # str.join already yields "" when nothing was indexed.
    indexed_filenames = ",".join(file for ctype in index_files for file in index_files[ctype])
    return Response(content=indexed_filenames, status_code=200)
165
-
166
-
167
def configure_search(search_models: SearchModels, search_config: Optional[SearchConfig]) -> Optional[SearchModels]:
    """Return ``search_models``, substituting a fresh ``SearchModels`` when it is ``None``.

    ``search_config`` is accepted but not consulted by this function.
    """
    return SearchModels() if search_models is None else search_models
173
-
174
-
175
def configure_content(
    files: Optional[dict[str, dict[str, str]]],
    regenerate: bool = False,
    t: Optional[state.SearchType] = state.SearchType.All,
    full_corpus: bool = True,
    user: KhojUser = None,
) -> bool:
    """Set up search for each requested content type and report overall success.

    Uploaded content (org, markdown, pdf, plaintext, image, docx) is indexed
    when files of that type were supplied. GitHub and Notion are indexed from
    the user's stored configuration, and only when no uploaded documents were
    supplied at all. A failure in one content type is logged and does not stop
    the remaining types from being processed.

    Args:
        files: Mapping from content type key to a mapping of file name to
            content. ``None`` means there is nothing to process.
        regenerate: Forwarded to ``text_search.setup``.
        t: Search type to set up, as a ``state.SearchType`` member or one of
            its values. ``None`` is treated as ``state.SearchType.All``.
        full_corpus: Forwarded to ``text_search.setup``.
        user: User whose content is indexed and whose query cache is reset.

    Returns:
        ``True`` when every attempted content type was set up without error
        (or ``files`` was ``None``); ``False`` when ``t`` is not a valid search
        type or any content type failed.
    """
    if t is None:
        t = state.SearchType.All

    # Accept both enum members and raw values. An unknown value is reported
    # and rejected instead of failing on attribute access.
    try:
        t = state.SearchType(t)
    except ValueError:
        logger.warning(f"🚨 Invalid search type: {t}")
        return False

    search_type = t.value

    # Guard before touching `files` so a missing payload takes the early return.
    if files is None:
        logger.warning(f"🚨 No files to process for {search_type} search.")
        return True

    no_documents = all(not files.get(file_type) for file_type in files)

    def is_requested(content_type) -> bool:
        # True when the caller asked for every type or for this specific type.
        return search_type in (state.SearchType.All.value, content_type.value)

    def index_uploaded(content_type, file_key, processor, start_message, failure_label) -> bool:
        # Index uploaded files of one content type; log and report failure rather than raise.
        try:
            content = files.get(file_key)
            if is_requested(content_type) and content:
                logger.info(start_message)
                text_search.setup(
                    processor,
                    content,
                    regenerate=regenerate,
                    full_corpus=full_corpus,
                    user=user,
                )
        except Exception as e:
            logger.error(f"🚨 Failed to setup {failure_label}: {e}", exc_info=True)
            return False
        return True

    def index_configured(content_type, processor, load_config, start_message, failure_label) -> bool:
        # Index a source driven by stored user configuration rather than uploaded files.
        try:
            # Configured sources are only synced when no documents were uploaded.
            if no_documents and is_requested(content_type):
                config = load_config()
                if config is not None:
                    logger.info(start_message)
                    text_search.setup(
                        processor,
                        None,
                        regenerate=regenerate,
                        full_corpus=full_corpus,
                        user=user,
                        config=config,
                    )
        except Exception as e:
            logger.error(f"🚨 Failed to setup {failure_label}: {e}", exc_info=True)
            return False
        return True

    # Each entry runs in order; every content type is attempted even if an
    # earlier one failed.
    outcomes = [
        index_uploaded(state.SearchType.Org, "org", OrgToEntries, "🦄 Setting up search for orgmode notes", "org"),
        index_uploaded(
            state.SearchType.Markdown,
            "markdown",
            MarkdownToEntries,
            "💎 Setting up search for markdown notes",
            "markdown",
        ),
        index_uploaded(state.SearchType.Pdf, "pdf", PdfToEntries, "🖨️ Setting up search for pdf", "PDF"),
        index_uploaded(
            state.SearchType.Plaintext,
            "plaintext",
            PlaintextToEntries,
            "📄 Setting up search for plaintext",
            "plaintext",
        ),
        index_configured(
            state.SearchType.Github,
            GithubToEntries,
            lambda: GithubConfig.objects.filter(user=user).prefetch_related("githubrepoconfig").first(),
            "🐙 Setting up search for github",
            "GitHub",
        ),
        index_configured(
            state.SearchType.Notion,
            NotionToEntries,
            lambda: NotionConfig.objects.filter(user=user).first(),
            "🔌 Setting up search for notion",
            "Notion",
        ),
        index_uploaded(state.SearchType.Image, "image", ImageToEntries, "🖼️ Setting up search for images", "images"),
        index_uploaded(state.SearchType.Docx, "docx", DocxToEntries, "📄 Setting up search for docx", "docx"),
    ]

    # Invalidate Query Cache
    if user:
        state.query_cache[user.uuid] = LRU()

    return all(outcomes)