khoj 1.28.4.dev23__py3-none-any.whl → 1.28.4.dev77__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. khoj/configure.py +4 -6
  2. khoj/database/adapters/__init__.py +124 -34
  3. khoj/database/models/__init__.py +4 -0
  4. khoj/interface/compiled/404/index.html +1 -1
  5. khoj/interface/compiled/_next/static/chunks/1603-2418b11d8e8dacb9.js +1 -0
  6. khoj/interface/compiled/_next/static/chunks/1970-c78f6acc8e16e30b.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/3124-a4cea2eda163128d.js +1 -0
  8. khoj/interface/compiled/_next/static/chunks/5538-5c4f2271e9377b74.js +1 -0
  9. khoj/interface/compiled/_next/static/chunks/8423-db6dad6d44869097.js +1 -0
  10. khoj/interface/compiled/_next/static/chunks/9417-7a8a6da918d37750.js +1 -0
  11. khoj/interface/compiled/_next/static/chunks/app/agents/{page-36da67f03a173e52.js → page-4353b1a532795ad1.js} +1 -1
  12. khoj/interface/compiled/_next/static/chunks/app/automations/{page-774ae3e033f938cd.js → page-c9f13c865e739607.js} +1 -1
  13. khoj/interface/compiled/_next/static/chunks/app/chat/page-97876b3bd3c5e69d.js +1 -0
  14. khoj/interface/compiled/_next/static/chunks/app/{page-322c37514a3a613a.js → page-c33ebe19a3b7b0b2.js} +1 -1
  15. khoj/interface/compiled/_next/static/chunks/app/search/{page-9b64f61caa5bd7f9.js → page-8e28deacb61f75aa.js} +1 -1
  16. khoj/interface/compiled/_next/static/chunks/app/settings/page-2fab613a557d3cc5.js +1 -0
  17. khoj/interface/compiled/_next/static/chunks/app/share/chat/page-3ee3da7e8dfe3572.js +1 -0
  18. khoj/interface/compiled/_next/static/chunks/{webpack-c9799fdebf88abb6.js → webpack-ff5eae43b8dba1d2.js} +1 -1
  19. khoj/interface/compiled/_next/static/css/23f801d22927d568.css +1 -0
  20. khoj/interface/compiled/_next/static/css/592ca99f5122e75a.css +1 -0
  21. khoj/interface/compiled/_next/static/css/af0f36f71f368260.css +25 -0
  22. khoj/interface/compiled/agents/index.html +1 -1
  23. khoj/interface/compiled/agents/index.txt +2 -2
  24. khoj/interface/compiled/automations/index.html +1 -1
  25. khoj/interface/compiled/automations/index.txt +2 -2
  26. khoj/interface/compiled/chat/index.html +1 -1
  27. khoj/interface/compiled/chat/index.txt +2 -2
  28. khoj/interface/compiled/index.html +1 -1
  29. khoj/interface/compiled/index.txt +3 -3
  30. khoj/interface/compiled/search/index.html +1 -1
  31. khoj/interface/compiled/search/index.txt +2 -2
  32. khoj/interface/compiled/settings/index.html +1 -1
  33. khoj/interface/compiled/settings/index.txt +2 -2
  34. khoj/interface/compiled/share/chat/index.html +1 -1
  35. khoj/interface/compiled/share/chat/index.txt +3 -3
  36. khoj/processor/content/docx/docx_to_entries.py +27 -21
  37. khoj/processor/content/github/github_to_entries.py +2 -2
  38. khoj/processor/content/images/image_to_entries.py +2 -2
  39. khoj/processor/content/markdown/markdown_to_entries.py +2 -2
  40. khoj/processor/content/notion/notion_to_entries.py +2 -2
  41. khoj/processor/content/org_mode/org_to_entries.py +2 -2
  42. khoj/processor/content/pdf/pdf_to_entries.py +37 -29
  43. khoj/processor/content/plaintext/plaintext_to_entries.py +2 -2
  44. khoj/processor/content/text_to_entries.py +2 -2
  45. khoj/processor/conversation/anthropic/anthropic_chat.py +7 -1
  46. khoj/processor/conversation/google/gemini_chat.py +15 -2
  47. khoj/processor/conversation/offline/chat_model.py +4 -0
  48. khoj/processor/conversation/openai/gpt.py +6 -1
  49. khoj/processor/conversation/prompts.py +48 -4
  50. khoj/processor/conversation/utils.py +69 -11
  51. khoj/processor/image/generate.py +2 -0
  52. khoj/processor/tools/online_search.py +19 -3
  53. khoj/processor/tools/run_code.py +4 -0
  54. khoj/routers/api.py +6 -1
  55. khoj/routers/api_agents.py +8 -10
  56. khoj/routers/api_chat.py +64 -13
  57. khoj/routers/api_content.py +80 -8
  58. khoj/routers/helpers.py +105 -34
  59. khoj/routers/notion.py +1 -1
  60. khoj/routers/research.py +9 -2
  61. khoj/search_type/text_search.py +1 -1
  62. khoj/utils/fs_syncer.py +2 -1
  63. khoj/utils/rawconfig.py +32 -0
  64. {khoj-1.28.4.dev23.dist-info → khoj-1.28.4.dev77.dist-info}/METADATA +1 -1
  65. {khoj-1.28.4.dev23.dist-info → khoj-1.28.4.dev77.dist-info}/RECORD +70 -70
  66. khoj/interface/compiled/_next/static/chunks/1603-c1568f45947e9f2c.js +0 -1
  67. khoj/interface/compiled/_next/static/chunks/1970-d44050bf658ae5cc.js +0 -1
  68. khoj/interface/compiled/_next/static/chunks/5538-bf582517a8dd3faa.js +0 -1
  69. khoj/interface/compiled/_next/static/chunks/8423-a1f432e4a8d9a6b0.js +0 -1
  70. khoj/interface/compiled/_next/static/chunks/8840-b8d7b9f0923c6651.js +0 -1
  71. khoj/interface/compiled/_next/static/chunks/9417-0d0fc7eb49a86abb.js +0 -1
  72. khoj/interface/compiled/_next/static/chunks/app/chat/page-a369e2bda9897794.js +0 -1
  73. khoj/interface/compiled/_next/static/chunks/app/settings/page-10b288c103f19468.js +0 -1
  74. khoj/interface/compiled/_next/static/chunks/app/share/chat/page-959d5f097cf38c93.js +0 -1
  75. khoj/interface/compiled/_next/static/css/4cae6c0e5c72fb2d.css +0 -1
  76. khoj/interface/compiled/_next/static/css/9d45de78fba367c1.css +0 -1
  77. khoj/interface/compiled/_next/static/css/d2bc549245313f26.css +0 -25
  78. /khoj/interface/compiled/_next/static/{s_mKS5kELaw2v4a7_yWNP → sE94pAZEifEKkz4WQtTNW}/_buildManifest.js +0 -0
  79. /khoj/interface/compiled/_next/static/{s_mKS5kELaw2v4a7_yWNP → sE94pAZEifEKkz4WQtTNW}/_ssgManifest.js +0 -0
  80. {khoj-1.28.4.dev23.dist-info → khoj-1.28.4.dev77.dist-info}/WHEEL +0 -0
  81. {khoj-1.28.4.dev23.dist-info → khoj-1.28.4.dev77.dist-info}/entry_points.txt +0 -0
  82. {khoj-1.28.4.dev23.dist-info → khoj-1.28.4.dev77.dist-info}/licenses/LICENSE +0 -0
khoj/routers/api_chat.py CHANGED
@@ -19,7 +19,6 @@ from khoj.database.adapters import (
19
19
  AgentAdapters,
20
20
  ConversationAdapters,
21
21
  EntryAdapters,
22
- FileObjectAdapters,
23
22
  PublicConversationAdapters,
24
23
  aget_user_name,
25
24
  )
@@ -45,12 +44,13 @@ from khoj.routers.helpers import (
45
44
  ConversationCommandRateLimiter,
46
45
  DeleteMessageRequestBody,
47
46
  FeedbackData,
47
+ acreate_title_from_history,
48
48
  agenerate_chat_response,
49
49
  aget_relevant_information_sources,
50
50
  aget_relevant_output_modes,
51
51
  construct_automation_created_message,
52
52
  create_automation,
53
- extract_relevant_info,
53
+ gather_raw_query_files,
54
54
  generate_excalidraw_diagram,
55
55
  generate_summary_from_files,
56
56
  get_conversation_command,
@@ -76,7 +76,12 @@ from khoj.utils.helpers import (
76
76
  get_device,
77
77
  is_none_or_empty,
78
78
  )
79
- from khoj.utils.rawconfig import FileFilterRequest, FilesFilterRequest, LocationData
79
+ from khoj.utils.rawconfig import (
80
+ ChatRequestBody,
81
+ FileFilterRequest,
82
+ FilesFilterRequest,
83
+ LocationData,
84
+ )
80
85
 
81
86
  # Initialize Router
82
87
  logger = logging.getLogger(__name__)
@@ -374,7 +379,7 @@ def fork_public_conversation(
374
379
  {
375
380
  "status": "ok",
376
381
  "next_url": redirect_uri,
377
- "conversation_id": new_conversation.id,
382
+ "conversation_id": str(new_conversation.id),
378
383
  }
379
384
  ),
380
385
  )
@@ -530,6 +535,32 @@ async def set_conversation_title(
530
535
  )
531
536
 
532
537
 
538
+ @api_chat.post("/title")
539
+ @requires(["authenticated"])
540
+ async def generate_chat_title(
541
+ request: Request,
542
+ common: CommonQueryParams,
543
+ conversation_id: str,
544
+ ):
545
+ user: KhojUser = request.user.object
546
+ conversation = await ConversationAdapters.aget_conversation_by_user(user=user, conversation_id=conversation_id)
547
+
548
+ # Conversation.title is explicitly set by the user. Do not override.
549
+ if conversation.title:
550
+ return {"status": "ok", "title": conversation.title}
551
+
552
+ if not conversation:
553
+ raise HTTPException(status_code=404, detail="Conversation not found")
554
+
555
+ new_title = await acreate_title_from_history(request.user.object, conversation=conversation)
556
+
557
+ conversation.slug = new_title
558
+
559
+ conversation.asave()
560
+
561
+ return {"status": "ok", "title": new_title}
562
+
563
+
533
564
  @api_chat.delete("/conversation/message", response_class=Response)
534
565
  @requires(["authenticated"])
535
566
  def delete_message(request: Request, delete_request: DeleteMessageRequestBody) -> Response:
@@ -571,6 +602,7 @@ async def chat(
571
602
  country_code = body.country_code or get_country_code_from_timezone(body.timezone)
572
603
  timezone = body.timezone
573
604
  raw_images = body.images
605
+ raw_query_files = body.files
574
606
 
575
607
  async def event_generator(q: str, images: list[str]):
576
608
  start_time = time.perf_counter()
@@ -582,6 +614,7 @@ async def chat(
582
614
  q = unquote(q)
583
615
  train_of_thought = []
584
616
  nonlocal conversation_id
617
+ nonlocal raw_query_files
585
618
 
586
619
  tracer: dict = {
587
620
  "mid": turn_id,
@@ -601,6 +634,11 @@ async def chat(
601
634
  if uploaded_image:
602
635
  uploaded_images.append(uploaded_image)
603
636
 
637
+ query_files: Dict[str, str] = {}
638
+ if raw_query_files:
639
+ for file in raw_query_files:
640
+ query_files[file.name] = file.content
641
+
604
642
  async def send_event(event_type: ChatEvent, data: str | dict):
605
643
  nonlocal connection_alive, ttft, train_of_thought
606
644
  if not connection_alive or await request.is_disconnected():
@@ -711,6 +749,8 @@ async def chat(
711
749
  ## Extract Document References
712
750
  compiled_references: List[Any] = []
713
751
  inferred_queries: List[Any] = []
752
+ file_filters = conversation.file_filters if conversation and conversation.file_filters else []
753
+ attached_file_context = gather_raw_query_files(query_files)
714
754
 
715
755
  if conversation_commands == [ConversationCommand.Default] or is_automated_task:
716
756
  conversation_commands = await aget_relevant_information_sources(
@@ -720,6 +760,7 @@ async def chat(
720
760
  user=user,
721
761
  query_images=uploaded_images,
722
762
  agent=agent,
763
+ query_files=attached_file_context,
723
764
  tracer=tracer,
724
765
  )
725
766
 
@@ -765,6 +806,7 @@ async def chat(
765
806
  user_name=user_name,
766
807
  location=location,
767
808
  file_filters=conversation.file_filters if conversation else [],
809
+ query_files=attached_file_context,
768
810
  tracer=tracer,
769
811
  ):
770
812
  if isinstance(research_result, InformationCollectionIteration):
@@ -804,10 +846,6 @@ async def chat(
804
846
  response_log = "No files selected for summarization. Please add files using the section on the left."
805
847
  async for result in send_llm_response(response_log):
806
848
  yield result
807
- elif len(file_filters) > 1 and not agent_has_entries:
808
- response_log = "Only one file can be selected for summarization."
809
- async for result in send_llm_response(response_log):
810
- yield result
811
849
  else:
812
850
  async for response in generate_summary_from_files(
813
851
  q=q,
@@ -817,6 +855,7 @@ async def chat(
817
855
  query_images=uploaded_images,
818
856
  agent=agent,
819
857
  send_status_func=partial(send_event, ChatEvent.STATUS),
858
+ query_files=attached_file_context,
820
859
  tracer=tracer,
821
860
  ):
822
861
  if isinstance(response, dict) and ChatEvent.STATUS in response:
@@ -837,8 +876,9 @@ async def chat(
837
876
  client_application=request.user.client_app,
838
877
  conversation_id=conversation_id,
839
878
  query_images=uploaded_images,
840
- tracer=tracer,
841
879
  train_of_thought=train_of_thought,
880
+ raw_query_files=raw_query_files,
881
+ tracer=tracer,
842
882
  )
843
883
  return
844
884
 
@@ -882,8 +922,9 @@ async def chat(
882
922
  inferred_queries=[query_to_run],
883
923
  automation_id=automation.id,
884
924
  query_images=uploaded_images,
885
- tracer=tracer,
886
925
  train_of_thought=train_of_thought,
926
+ raw_query_files=raw_query_files,
927
+ tracer=tracer,
887
928
  )
888
929
  async for result in send_llm_response(llm_response):
889
930
  yield result
@@ -905,6 +946,7 @@ async def chat(
905
946
  partial(send_event, ChatEvent.STATUS),
906
947
  query_images=uploaded_images,
907
948
  agent=agent,
949
+ query_files=attached_file_context,
908
950
  tracer=tracer,
909
951
  ):
910
952
  if isinstance(result, dict) and ChatEvent.STATUS in result:
@@ -950,6 +992,7 @@ async def chat(
950
992
  custom_filters,
951
993
  query_images=uploaded_images,
952
994
  agent=agent,
995
+ query_files=attached_file_context,
953
996
  tracer=tracer,
954
997
  ):
955
998
  if isinstance(result, dict) and ChatEvent.STATUS in result:
@@ -975,6 +1018,7 @@ async def chat(
975
1018
  partial(send_event, ChatEvent.STATUS),
976
1019
  query_images=uploaded_images,
977
1020
  agent=agent,
1021
+ query_files=attached_file_context,
978
1022
  tracer=tracer,
979
1023
  ):
980
1024
  if isinstance(result, dict) and ChatEvent.STATUS in result:
@@ -1015,6 +1059,7 @@ async def chat(
1015
1059
  partial(send_event, ChatEvent.STATUS),
1016
1060
  query_images=uploaded_images,
1017
1061
  agent=agent,
1062
+ query_files=attached_file_context,
1018
1063
  tracer=tracer,
1019
1064
  ):
1020
1065
  if isinstance(result, dict) and ChatEvent.STATUS in result:
@@ -1055,6 +1100,7 @@ async def chat(
1055
1100
  send_status_func=partial(send_event, ChatEvent.STATUS),
1056
1101
  query_images=uploaded_images,
1057
1102
  agent=agent,
1103
+ query_files=attached_file_context,
1058
1104
  tracer=tracer,
1059
1105
  ):
1060
1106
  if isinstance(result, dict) and ChatEvent.STATUS in result:
@@ -1086,8 +1132,9 @@ async def chat(
1086
1132
  compiled_references=compiled_references,
1087
1133
  online_results=online_results,
1088
1134
  query_images=uploaded_images,
1089
- tracer=tracer,
1090
1135
  train_of_thought=train_of_thought,
1136
+ raw_query_files=raw_query_files,
1137
+ tracer=tracer,
1091
1138
  )
1092
1139
  content_obj = {
1093
1140
  "intentType": intent_type,
@@ -1116,6 +1163,7 @@ async def chat(
1116
1163
  user=user,
1117
1164
  agent=agent,
1118
1165
  send_status_func=partial(send_event, ChatEvent.STATUS),
1166
+ query_files=attached_file_context,
1119
1167
  tracer=tracer,
1120
1168
  ):
1121
1169
  if isinstance(result, dict) and ChatEvent.STATUS in result:
@@ -1144,8 +1192,9 @@ async def chat(
1144
1192
  compiled_references=compiled_references,
1145
1193
  online_results=online_results,
1146
1194
  query_images=uploaded_images,
1147
- tracer=tracer,
1148
1195
  train_of_thought=train_of_thought,
1196
+ raw_query_files=raw_query_files,
1197
+ tracer=tracer,
1149
1198
  )
1150
1199
 
1151
1200
  async for result in send_llm_response(json.dumps(content_obj)):
@@ -1171,8 +1220,10 @@ async def chat(
1171
1220
  user_name,
1172
1221
  researched_results,
1173
1222
  uploaded_images,
1174
- tracer,
1175
1223
  train_of_thought,
1224
+ attached_file_context,
1225
+ raw_query_files,
1226
+ tracer,
1176
1227
  )
1177
1228
 
1178
1229
  # Send Response
@@ -36,16 +36,18 @@ from khoj.database.models import (
36
36
  LocalPlaintextConfig,
37
37
  NotionConfig,
38
38
  )
39
+ from khoj.processor.content.docx.docx_to_entries import DocxToEntries
40
+ from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
39
41
  from khoj.routers.helpers import (
40
42
  ApiIndexedDataLimiter,
41
43
  CommonQueryParams,
42
44
  configure_content,
45
+ get_file_content,
43
46
  get_user_config,
44
47
  update_telemetry_state,
45
48
  )
46
49
  from khoj.utils import constants, state
47
50
  from khoj.utils.config import SearchModels
48
- from khoj.utils.helpers import get_file_type
49
51
  from khoj.utils.rawconfig import (
50
52
  ContentConfig,
51
53
  FullConfig,
@@ -237,7 +239,7 @@ async def set_content_notion(
237
239
 
238
240
  if updated_config.token:
239
241
  # Trigger an async job to configure_content. Let it run without blocking the response.
240
- background_tasks.add_task(run_in_executor, configure_content, {}, False, SearchType.Notion, user)
242
+ background_tasks.add_task(run_in_executor, configure_content, user, {}, False, SearchType.Notion)
241
243
 
242
244
  update_telemetry_state(
243
245
  request=request,
@@ -375,6 +377,75 @@ async def delete_content_source(
375
377
  return {"status": "ok"}
376
378
 
377
379
 
380
+ @api_content.post("/convert", status_code=200)
381
+ @requires(["authenticated"])
382
+ async def convert_documents(
383
+ request: Request,
384
+ files: List[UploadFile],
385
+ client: Optional[str] = None,
386
+ ):
387
+ MAX_FILE_SIZE_MB = 10 # 10MB limit
388
+ MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
389
+
390
+ converted_files = []
391
+ supported_files = ["org", "markdown", "pdf", "plaintext", "docx"]
392
+
393
+ for file in files:
394
+ # Check file size first
395
+ file_size = 0
396
+ content = await file.read()
397
+ file_size = len(content)
398
+ await file.seek(0) # Reset file pointer
399
+
400
+ if file_size > MAX_FILE_SIZE_BYTES:
401
+ logger.warning(
402
+ f"Skipped converting oversized file ({file_size / 1024 / 1024:.1f}MB) sent by {client} client: {file.filename}"
403
+ )
404
+ continue
405
+
406
+ file_data = get_file_content(file)
407
+ if file_data.file_type in supported_files:
408
+ extracted_content = (
409
+ file_data.content.decode(file_data.encoding) if file_data.encoding else file_data.content
410
+ )
411
+
412
+ if file_data.file_type == "docx":
413
+ entries_per_page = DocxToEntries.extract_text(file_data.content)
414
+ annotated_pages = [
415
+ f"Page {index} of {file_data.name}:\n\n{entry}" for index, entry in enumerate(entries_per_page)
416
+ ]
417
+ extracted_content = "\n".join(annotated_pages)
418
+
419
+ elif file_data.file_type == "pdf":
420
+ entries_per_page = PdfToEntries.extract_text(file_data.content)
421
+ annotated_pages = [
422
+ f"Page {index} of {file_data.name}:\n\n{entry}" for index, entry in enumerate(entries_per_page)
423
+ ]
424
+ extracted_content = "\n".join(annotated_pages)
425
+
426
+ size_in_bytes = len(extracted_content.encode("utf-8"))
427
+
428
+ converted_files.append(
429
+ {
430
+ "name": file_data.name,
431
+ "content": extracted_content,
432
+ "file_type": file_data.file_type,
433
+ "size": size_in_bytes,
434
+ }
435
+ )
436
+ else:
437
+ logger.warning(f"Skipped converting unsupported file type sent by {client} client: {file.filename}")
438
+
439
+ update_telemetry_state(
440
+ request=request,
441
+ telemetry_type="api",
442
+ api="convert_documents",
443
+ client=client,
444
+ )
445
+
446
+ return Response(content=json.dumps(converted_files), media_type="application/json", status_code=200)
447
+
448
+
378
449
  async def indexer(
379
450
  request: Request,
380
451
  files: list[UploadFile],
@@ -398,12 +469,13 @@ async def indexer(
398
469
  try:
399
470
  logger.info(f"📬 Updating content index via API call by {client} client")
400
471
  for file in files:
401
- file_content = file.file.read()
402
- file_type, encoding = get_file_type(file.content_type, file_content)
403
- if file_type in index_files:
404
- index_files[file_type][file.filename] = file_content.decode(encoding) if encoding else file_content
472
+ file_data = get_file_content(file)
473
+ if file_data.file_type in index_files:
474
+ index_files[file_data.file_type][file_data.name] = (
475
+ file_data.content.decode(file_data.encoding) if file_data.encoding else file_data.content
476
+ )
405
477
  else:
406
- logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
478
+ logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file_data.name}")
407
479
 
408
480
  indexer_input = IndexerInput(
409
481
  org=index_files["org"],
@@ -440,10 +512,10 @@ async def indexer(
440
512
  success = await loop.run_in_executor(
441
513
  None,
442
514
  configure_content,
515
+ user,
443
516
  indexer_input.model_dump(),
444
517
  regenerate,
445
518
  t,
446
- user,
447
519
  )
448
520
  if not success:
449
521
  raise RuntimeError(f"Failed to {method} {t} data sent by {client} client into content index")