khoj 1.28.3__py3-none-any.whl → 1.28.4.dev92__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. khoj/configure.py +10 -14
  2. khoj/database/adapters/__init__.py +128 -44
  3. khoj/database/admin.py +6 -3
  4. khoj/database/management/commands/change_default_model.py +7 -72
  5. khoj/database/migrations/0073_delete_usersearchmodelconfig.py +15 -0
  6. khoj/database/models/__init__.py +4 -6
  7. khoj/interface/compiled/404/index.html +1 -1
  8. khoj/interface/compiled/_next/static/chunks/1603-dc5fd983dbcd070d.js +1 -0
  9. khoj/interface/compiled/_next/static/chunks/1970-c78f6acc8e16e30b.js +1 -0
  10. khoj/interface/compiled/_next/static/chunks/2261-748f7c327df3c8c1.js +1 -0
  11. khoj/interface/compiled/_next/static/chunks/3124-a4cea2eda163128d.js +1 -0
  12. khoj/interface/compiled/_next/static/chunks/3803-d74118a2d0182c52.js +1 -0
  13. khoj/interface/compiled/_next/static/chunks/5538-36aa824a75519c5b.js +1 -0
  14. khoj/interface/compiled/_next/static/chunks/5961-3c104d9736b7902b.js +3 -0
  15. khoj/interface/compiled/_next/static/chunks/8423-ebfa9bb9e2424ca3.js +1 -0
  16. khoj/interface/compiled/_next/static/chunks/9417-32c4db52ca42e681.js +1 -0
  17. khoj/interface/compiled/_next/static/chunks/app/agents/layout-e9838b642913a071.js +1 -0
  18. khoj/interface/compiled/_next/static/chunks/app/agents/page-4353b1a532795ad1.js +1 -0
  19. khoj/interface/compiled/_next/static/chunks/app/automations/{page-d3edae545a1b5393.js → page-c9f13c865e739607.js} +1 -1
  20. khoj/interface/compiled/_next/static/chunks/app/chat/layout-b0e7ff4baa3b5265.js +1 -0
  21. khoj/interface/compiled/_next/static/chunks/app/chat/page-45720e1ed71e3ef5.js +1 -0
  22. khoj/interface/compiled/_next/static/chunks/app/{layout-d0f0a9067427fb20.js → layout-86561d2fac35a91a.js} +1 -1
  23. khoj/interface/compiled/_next/static/chunks/app/{page-ea462e20376b6dce.js → page-ecb8e1c192aa8834.js} +1 -1
  24. khoj/interface/compiled/_next/static/chunks/app/search/layout-ea6b73fdaf9b24ca.js +1 -0
  25. khoj/interface/compiled/_next/static/chunks/app/search/{page-a5c277eff207959e.js → page-8e28deacb61f75aa.js} +1 -1
  26. khoj/interface/compiled/_next/static/chunks/app/settings/{layout-a8f33dfe92f997fb.js → layout-254eaaf916449a60.js} +1 -1
  27. khoj/interface/compiled/_next/static/chunks/app/settings/page-2fab613a557d3cc5.js +1 -0
  28. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-cf7445cf0326bda3.js +1 -0
  29. khoj/interface/compiled/_next/static/chunks/app/share/chat/page-30376aa7e9cfa342.js +1 -0
  30. khoj/interface/compiled/_next/static/chunks/{main-f84cd3c1873cd842.js → main-1ea5c2e0fdef4626.js} +1 -1
  31. khoj/interface/compiled/_next/static/chunks/{webpack-8beec5b51cabb39a.js → webpack-27cf153c35b1338d.js} +1 -1
  32. khoj/interface/compiled/_next/static/css/{467a524c75e7d7c0.css → 0e9d53dcd7f11342.css} +1 -1
  33. khoj/interface/compiled/_next/static/css/{26c1c33d0423a7d8.css → 1f293605f2871853.css} +1 -1
  34. khoj/interface/compiled/_next/static/css/2d097a35da6bfe8d.css +1 -0
  35. khoj/interface/compiled/_next/static/css/80bd6301fc657983.css +1 -0
  36. khoj/interface/compiled/_next/static/css/ed437164d77aa600.css +25 -0
  37. khoj/interface/compiled/_next/static/media/5455839c73f146e7-s.p.woff2 +0 -0
  38. khoj/interface/compiled/_next/static/media/5984b96ba4822821-s.woff2 +0 -0
  39. khoj/interface/compiled/_next/static/media/684adc3dde1b03f1-s.woff2 +0 -0
  40. khoj/interface/compiled/_next/static/media/82e3b9a1bdaf0c26-s.woff2 +0 -0
  41. khoj/interface/compiled/_next/static/media/8d1ea331386a0db8-s.woff2 +0 -0
  42. khoj/interface/compiled/_next/static/media/91475f6526542a4f-s.woff2 +0 -0
  43. khoj/interface/compiled/_next/static/media/b98b13dbc1c3b59c-s.woff2 +0 -0
  44. khoj/interface/compiled/_next/static/media/c824d7a20139e39d-s.woff2 +0 -0
  45. khoj/interface/compiled/agents/index.html +1 -1
  46. khoj/interface/compiled/agents/index.txt +2 -2
  47. khoj/interface/compiled/automations/index.html +1 -1
  48. khoj/interface/compiled/automations/index.txt +2 -2
  49. khoj/interface/compiled/chat/index.html +1 -1
  50. khoj/interface/compiled/chat/index.txt +2 -2
  51. khoj/interface/compiled/index.html +1 -1
  52. khoj/interface/compiled/index.txt +3 -3
  53. khoj/interface/compiled/search/index.html +1 -1
  54. khoj/interface/compiled/search/index.txt +2 -2
  55. khoj/interface/compiled/settings/index.html +1 -1
  56. khoj/interface/compiled/settings/index.txt +3 -3
  57. khoj/interface/compiled/share/chat/index.html +1 -1
  58. khoj/interface/compiled/share/chat/index.txt +3 -3
  59. khoj/processor/content/docx/docx_to_entries.py +27 -21
  60. khoj/processor/content/github/github_to_entries.py +2 -2
  61. khoj/processor/content/images/image_to_entries.py +2 -2
  62. khoj/processor/content/markdown/markdown_to_entries.py +2 -2
  63. khoj/processor/content/notion/notion_to_entries.py +2 -2
  64. khoj/processor/content/org_mode/org_to_entries.py +2 -2
  65. khoj/processor/content/org_mode/orgnode.py +1 -1
  66. khoj/processor/content/pdf/pdf_to_entries.py +37 -29
  67. khoj/processor/content/plaintext/plaintext_to_entries.py +2 -2
  68. khoj/processor/content/text_to_entries.py +3 -4
  69. khoj/processor/conversation/anthropic/anthropic_chat.py +9 -1
  70. khoj/processor/conversation/google/gemini_chat.py +15 -2
  71. khoj/processor/conversation/google/utils.py +3 -1
  72. khoj/processor/conversation/offline/chat_model.py +4 -0
  73. khoj/processor/conversation/openai/gpt.py +6 -1
  74. khoj/processor/conversation/prompts.py +72 -13
  75. khoj/processor/conversation/utils.py +80 -13
  76. khoj/processor/image/generate.py +2 -0
  77. khoj/processor/tools/online_search.py +68 -18
  78. khoj/processor/tools/run_code.py +54 -20
  79. khoj/routers/api.py +10 -4
  80. khoj/routers/api_agents.py +8 -10
  81. khoj/routers/api_chat.py +89 -24
  82. khoj/routers/api_content.py +80 -8
  83. khoj/routers/helpers.py +176 -60
  84. khoj/routers/notion.py +1 -1
  85. khoj/routers/research.py +73 -31
  86. khoj/routers/web_client.py +0 -10
  87. khoj/search_type/text_search.py +3 -7
  88. khoj/utils/cli.py +2 -2
  89. khoj/utils/fs_syncer.py +2 -1
  90. khoj/utils/helpers.py +6 -3
  91. khoj/utils/rawconfig.py +32 -0
  92. khoj/utils/state.py +2 -1
  93. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/METADATA +3 -3
  94. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/RECORD +99 -105
  95. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/WHEEL +1 -1
  96. khoj/interface/compiled/_next/static/chunks/1034-da58b679fcbb79c1.js +0 -1
  97. khoj/interface/compiled/_next/static/chunks/1467-b331e469fe411347.js +0 -1
  98. khoj/interface/compiled/_next/static/chunks/1603-c1568f45947e9f2c.js +0 -1
  99. khoj/interface/compiled/_next/static/chunks/1970-d44050bf658ae5cc.js +0 -1
  100. khoj/interface/compiled/_next/static/chunks/3110-ef2cacd1b8d79ad8.js +0 -1
  101. khoj/interface/compiled/_next/static/chunks/3423-f4b7df2f6f3362f7.js +0 -1
  102. khoj/interface/compiled/_next/static/chunks/394-6bcb8c429f168f21.js +0 -3
  103. khoj/interface/compiled/_next/static/chunks/7113-f2e114d7034a0835.js +0 -1
  104. khoj/interface/compiled/_next/static/chunks/8423-da57554315eebcbe.js +0 -1
  105. khoj/interface/compiled/_next/static/chunks/8840-b8d7b9f0923c6651.js +0 -1
  106. khoj/interface/compiled/_next/static/chunks/9417-0d0fc7eb49a86abb.js +0 -1
  107. khoj/interface/compiled/_next/static/chunks/app/agents/layout-75636ab3a413fa8e.js +0 -1
  108. khoj/interface/compiled/_next/static/chunks/app/agents/page-adbf3cd470da248f.js +0 -1
  109. khoj/interface/compiled/_next/static/chunks/app/chat/layout-96fcf62857bf8f30.js +0 -1
  110. khoj/interface/compiled/_next/static/chunks/app/chat/page-222d348681b848a5.js +0 -1
  111. khoj/interface/compiled/_next/static/chunks/app/factchecker/layout-7b30c541c05fb904.js +0 -1
  112. khoj/interface/compiled/_next/static/chunks/app/factchecker/page-bded0868a08ac4ba.js +0 -1
  113. khoj/interface/compiled/_next/static/chunks/app/search/layout-3720f1362310bebb.js +0 -1
  114. khoj/interface/compiled/_next/static/chunks/app/settings/page-210bd54db4841333.js +0 -1
  115. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-2df56074e42adaa0.js +0 -1
  116. khoj/interface/compiled/_next/static/chunks/app/share/chat/page-a21b7e8890ed1209.js +0 -1
  117. khoj/interface/compiled/_next/static/css/4cae6c0e5c72fb2d.css +0 -1
  118. khoj/interface/compiled/_next/static/css/553f9cdcc7a2bcd6.css +0 -1
  119. khoj/interface/compiled/_next/static/css/a795ee88875f4853.css +0 -25
  120. khoj/interface/compiled/_next/static/css/afd3d45cc65d55d8.css +0 -1
  121. khoj/interface/compiled/_next/static/media/0e790e04fd40ad16-s.p.woff2 +0 -0
  122. khoj/interface/compiled/_next/static/media/4221e1667cd19c7d-s.woff2 +0 -0
  123. khoj/interface/compiled/_next/static/media/6c276159aa0eb14b-s.woff2 +0 -0
  124. khoj/interface/compiled/_next/static/media/6cc0b9500e4f9168-s.woff2 +0 -0
  125. khoj/interface/compiled/_next/static/media/9d9319a7a2ac39c6-s.woff2 +0 -0
  126. khoj/interface/compiled/_next/static/media/a75c8ea86756d52d-s.woff2 +0 -0
  127. khoj/interface/compiled/_next/static/media/abce7c400ca31a51-s.woff2 +0 -0
  128. khoj/interface/compiled/_next/static/media/f759c939737fb668-s.woff2 +0 -0
  129. khoj/interface/compiled/factchecker/index.html +0 -1
  130. khoj/interface/compiled/factchecker/index.txt +0 -7
  131. /khoj/interface/compiled/_next/static/{EfnEiWDle86AUcxEdEFgO → t_2jovvUVve0Gvc3FqpT9}/_buildManifest.js +0 -0
  132. /khoj/interface/compiled/_next/static/{EfnEiWDle86AUcxEdEFgO → t_2jovvUVve0Gvc3FqpT9}/_ssgManifest.js +0 -0
  133. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/entry_points.txt +0 -0
  134. {khoj-1.28.3.dist-info → khoj-1.28.4.dev92.dist-info}/licenses/LICENSE +0 -0
@@ -36,16 +36,18 @@ from khoj.database.models import (
36
36
  LocalPlaintextConfig,
37
37
  NotionConfig,
38
38
  )
39
+ from khoj.processor.content.docx.docx_to_entries import DocxToEntries
40
+ from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
39
41
  from khoj.routers.helpers import (
40
42
  ApiIndexedDataLimiter,
41
43
  CommonQueryParams,
42
44
  configure_content,
45
+ get_file_content,
43
46
  get_user_config,
44
47
  update_telemetry_state,
45
48
  )
46
49
  from khoj.utils import constants, state
47
50
  from khoj.utils.config import SearchModels
48
- from khoj.utils.helpers import get_file_type
49
51
  from khoj.utils.rawconfig import (
50
52
  ContentConfig,
51
53
  FullConfig,
@@ -237,7 +239,7 @@ async def set_content_notion(
237
239
 
238
240
  if updated_config.token:
239
241
  # Trigger an async job to configure_content. Let it run without blocking the response.
240
- background_tasks.add_task(run_in_executor, configure_content, {}, False, SearchType.Notion, user)
242
+ background_tasks.add_task(run_in_executor, configure_content, user, {}, False, SearchType.Notion)
241
243
 
242
244
  update_telemetry_state(
243
245
  request=request,
@@ -375,6 +377,75 @@ async def delete_content_source(
375
377
  return {"status": "ok"}
376
378
 
377
379
 
380
def _annotate_pages(file_name, pages):
    """Join extracted pages into one string, heading each with its 1-based page number."""
    return "\n".join(
        f"Page {page_number} of {file_name}:\n\n{page}" for page_number, page in enumerate(pages, start=1)
    )


@api_content.post("/convert", status_code=200)
@requires(["authenticated"])
async def convert_documents(
    request: Request,
    files: List[UploadFile],
    client: Optional[str] = None,
):
    """Convert uploaded documents to plain text without adding them to the content index.

    Each upload larger than 10MB, or of a type outside the supported set, is
    skipped with a warning rather than failing the whole request.

    Args:
        request: Incoming request; forwarded to telemetry.
        files: Uploaded documents to convert.
        client: Optional name of the calling client, used in logs and telemetry.

    Returns:
        A JSON response containing a list of objects with the keys ``name``,
        ``content`` (extracted text), ``file_type`` and ``size`` (UTF-8 byte
        length of the extracted text). Skipped files are absent from the list.
    """
    MAX_FILE_SIZE_MB = 10  # 10MB limit
    MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024

    supported_files = ["org", "markdown", "pdf", "plaintext", "docx"]
    # Binary formats need a dedicated extractor that returns a sequence of text chunks.
    page_extractors = {
        "docx": DocxToEntries.extract_text,
        "pdf": PdfToEntries.extract_text,
    }
    converted_files = []

    for file in files:
        # Measure the upload, then rewind so get_file_content can read it from the start.
        file_size = len(await file.read())
        await file.seek(0)  # Reset file pointer

        if file_size > MAX_FILE_SIZE_BYTES:
            logger.warning(
                f"Skipped converting oversized file ({file_size / 1024 / 1024:.1f}MB) sent by {client} client: {file.filename}"
            )
            continue

        file_data = get_file_content(file)
        if file_data.file_type not in supported_files:
            logger.warning(f"Skipped converting unsupported file type sent by {client} client: {file.filename}")
            continue

        extract_pages = page_extractors.get(file_data.file_type)
        if extract_pages is not None:
            # Only run the extractor on binary formats; never attempt to decode them as text.
            extracted_content = _annotate_pages(file_data.name, extract_pages(file_data.content))
        else:
            # NOTE(review): assumes text formats always come back with an encoding set;
            # if not, the raw bytes below would fail on .encode() — confirm against get_file_content.
            extracted_content = (
                file_data.content.decode(file_data.encoding) if file_data.encoding else file_data.content
            )

        converted_files.append(
            {
                "name": file_data.name,
                "content": extracted_content,
                "file_type": file_data.file_type,
                "size": len(extracted_content.encode("utf-8")),
            }
        )

    update_telemetry_state(
        request=request,
        telemetry_type="api",
        api="convert_documents",
        client=client,
    )

    return Response(content=json.dumps(converted_files), media_type="application/json", status_code=200)
447
+
448
+
378
449
  async def indexer(
379
450
  request: Request,
380
451
  files: list[UploadFile],
@@ -398,12 +469,13 @@ async def indexer(
398
469
  try:
399
470
  logger.info(f"📬 Updating content index via API call by {client} client")
400
471
  for file in files:
401
- file_content = file.file.read()
402
- file_type, encoding = get_file_type(file.content_type, file_content)
403
- if file_type in index_files:
404
- index_files[file_type][file.filename] = file_content.decode(encoding) if encoding else file_content
472
+ file_data = get_file_content(file)
473
+ if file_data.file_type in index_files:
474
+ index_files[file_data.file_type][file_data.name] = (
475
+ file_data.content.decode(file_data.encoding) if file_data.encoding else file_data.content
476
+ )
405
477
  else:
406
- logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
478
+ logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file_data.name}")
407
479
 
408
480
  indexer_input = IndexerInput(
409
481
  org=index_files["org"],
@@ -440,10 +512,10 @@ async def indexer(
440
512
  success = await loop.run_in_executor(
441
513
  None,
442
514
  configure_content,
515
+ user,
443
516
  indexer_input.model_dump(),
444
517
  regenerate,
445
518
  t,
446
- user,
447
519
  )
448
520
  if not success:
449
521
  raise RuntimeError(f"Failed to {method} {t} data sent by {client} client into content index")