khoj 1.42.8.dev6__py3-none-any.whl → 1.42.9.dev16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. khoj/database/adapters/__init__.py +20 -0
  2. khoj/interface/compiled/404/index.html +2 -2
  3. khoj/interface/compiled/_next/static/chunks/app/agents/layout-2e626327abfbe612.js +1 -0
  4. khoj/interface/compiled/_next/static/chunks/app/agents/{page-9a4610474cd59a71.js → page-0006674668eb5a4d.js} +1 -1
  5. khoj/interface/compiled/_next/static/chunks/app/automations/{page-f7bb9d777b7745d4.js → page-4c465cde2d14cb52.js} +1 -1
  6. khoj/interface/compiled/_next/static/chunks/app/chat/layout-d6acbba22ccac0ff.js +1 -0
  7. khoj/interface/compiled/_next/static/chunks/app/chat/{page-ef738950ea1babc3.js → page-9967631715682f3c.js} +1 -1
  8. khoj/interface/compiled/_next/static/chunks/app/{page-2b3056cba8aa96ce.js → page-6e91caf9bc0c8aba.js} +1 -1
  9. khoj/interface/compiled/_next/static/chunks/app/search/layout-94c76c3a41db42a2.js +1 -0
  10. khoj/interface/compiled/_next/static/chunks/app/search/{page-4885df3cd175c957.js → page-883b7d8d2e3abe3e.js} +1 -1
  11. khoj/interface/compiled/_next/static/chunks/app/settings/{page-8be3b35178abf2ec.js → page-95e994ddac31473f.js} +1 -1
  12. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-95998f0bdc22bb13.js +1 -0
  13. khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-4a4b0c0f4749c2b2.js → page-8c8c175f7f212b03.js} +1 -1
  14. khoj/interface/compiled/_next/static/chunks/{webpack-15412ee214acd999.js → webpack-4bf3eab7681a1206.js} +1 -1
  15. khoj/interface/compiled/_next/static/css/1e9b757ee2a2b34b.css +1 -0
  16. khoj/interface/compiled/_next/static/css/440ae0f0f650dc35.css +1 -0
  17. khoj/interface/compiled/_next/static/css/bd2071cad2ecf293.css +1 -0
  18. khoj/interface/compiled/_next/static/css/ee66643a6a5bf71c.css +1 -0
  19. khoj/interface/compiled/agents/index.html +2 -2
  20. khoj/interface/compiled/agents/index.txt +2 -2
  21. khoj/interface/compiled/automations/index.html +2 -2
  22. khoj/interface/compiled/automations/index.txt +3 -3
  23. khoj/interface/compiled/chat/index.html +2 -2
  24. khoj/interface/compiled/chat/index.txt +2 -2
  25. khoj/interface/compiled/index.html +2 -2
  26. khoj/interface/compiled/index.txt +2 -2
  27. khoj/interface/compiled/search/index.html +2 -2
  28. khoj/interface/compiled/search/index.txt +2 -2
  29. khoj/interface/compiled/settings/index.html +2 -2
  30. khoj/interface/compiled/settings/index.txt +4 -4
  31. khoj/interface/compiled/share/chat/index.html +2 -2
  32. khoj/interface/compiled/share/chat/index.txt +2 -2
  33. khoj/processor/conversation/anthropic/anthropic_chat.py +11 -2
  34. khoj/processor/conversation/anthropic/utils.py +90 -103
  35. khoj/processor/conversation/google/gemini_chat.py +4 -1
  36. khoj/processor/conversation/google/utils.py +80 -18
  37. khoj/processor/conversation/offline/chat_model.py +3 -3
  38. khoj/processor/conversation/openai/gpt.py +13 -38
  39. khoj/processor/conversation/openai/utils.py +113 -12
  40. khoj/processor/conversation/prompts.py +17 -35
  41. khoj/processor/conversation/utils.py +128 -57
  42. khoj/processor/operator/grounding_agent.py +1 -1
  43. khoj/processor/operator/operator_agent_binary.py +4 -3
  44. khoj/processor/tools/online_search.py +18 -0
  45. khoj/processor/tools/run_code.py +1 -1
  46. khoj/routers/api_chat.py +1 -1
  47. khoj/routers/helpers.py +293 -26
  48. khoj/routers/research.py +169 -155
  49. khoj/utils/helpers.py +284 -8
  50. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/METADATA +1 -1
  51. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/RECORD +62 -62
  52. khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +0 -1
  53. khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +0 -1
  54. khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +0 -1
  55. khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +0 -1
  56. khoj/interface/compiled/_next/static/css/37a73b87f02df402.css +0 -1
  57. khoj/interface/compiled/_next/static/css/76c658ee459140a9.css +0 -1
  58. khoj/interface/compiled/_next/static/css/821d0d60b0b6871d.css +0 -1
  59. khoj/interface/compiled/_next/static/css/e6da1287d41f5409.css +0 -1
  60. /khoj/interface/compiled/_next/static/chunks/{1327-1a9107b9a2a04a98.js → 1327-3b1a41af530fa8ee.js} +0 -0
  61. /khoj/interface/compiled/_next/static/chunks/{1915-5c6508f6ebb62a30.js → 1915-fbfe167c84ad60c5.js} +0 -0
  62. /khoj/interface/compiled/_next/static/chunks/{2117-080746c8e170c81a.js → 2117-e78b6902ad6f75ec.js} +0 -0
  63. /khoj/interface/compiled/_next/static/chunks/{2939-4af3fd24b8ffc9ad.js → 2939-4d4084c5b888b960.js} +0 -0
  64. /khoj/interface/compiled/_next/static/chunks/{4447-cd95608f8e93e711.js → 4447-d6cf93724d57e34b.js} +0 -0
  65. /khoj/interface/compiled/_next/static/chunks/{8667-50b03a89e82e0ba7.js → 8667-4b7790573b08c50d.js} +0 -0
  66. /khoj/interface/compiled/_next/static/{cJdFAXV3MR9BSimUwQ40G → w19FJJa9p2AFJB6DEektd}/_buildManifest.js +0 -0
  67. /khoj/interface/compiled/_next/static/{cJdFAXV3MR9BSimUwQ40G → w19FJJa9p2AFJB6DEektd}/_ssgManifest.js +0 -0
  68. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/WHEEL +0 -0
  69. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/entry_points.txt +0 -0
  70. {khoj-1.42.8.dev6.dist-info → khoj-1.42.9.dev16.dist-info}/licenses/LICENSE +0 -0
@@ -121,7 +121,7 @@ class BinaryOperatorAgent(OperatorAgent):
121
121
  # Construct input for visual reasoner history
122
122
  visual_reasoner_history = self._format_message_for_api(self.messages)
123
123
  try:
124
- natural_language_action = await send_message_to_model_wrapper(
124
+ raw_response = await send_message_to_model_wrapper(
125
125
  query=query_text,
126
126
  query_images=query_screenshot,
127
127
  system_message=reasoning_system_prompt,
@@ -129,6 +129,7 @@ class BinaryOperatorAgent(OperatorAgent):
129
129
  agent_chat_model=self.reasoning_model,
130
130
  tracer=self.tracer,
131
131
  )
132
+ natural_language_action = raw_response.text
132
133
 
133
134
  if not isinstance(natural_language_action, str) or not natural_language_action.strip():
134
135
  raise ValueError(f"Natural language action is empty or not a string. Got {natural_language_action}")
@@ -255,10 +256,10 @@ class BinaryOperatorAgent(OperatorAgent):
255
256
 
256
257
  # Append summary messages to history
257
258
  trigger_summary = AgentMessage(role="user", content=summarize_prompt)
258
- summary_message = AgentMessage(role="assistant", content=summary)
259
+ summary_message = AgentMessage(role="assistant", content=summary.text)
259
260
  self.messages.extend([trigger_summary, summary_message])
260
261
 
261
- return summary
262
+ return summary.text
262
263
 
263
264
  def _compile_response(self, response_content: str | List) -> str:
264
265
  """Compile response content into a string, handling OpenAI message structures."""
@@ -390,7 +390,25 @@ async def read_webpages(
390
390
  query_files=query_files,
391
391
  tracer=tracer,
392
392
  )
393
+ async for result in read_webpages_content(
394
+ query,
395
+ urls,
396
+ user,
397
+ send_status_func=send_status_func,
398
+ agent=agent,
399
+ tracer=tracer,
400
+ ):
401
+ yield result
402
+
393
403
 
404
+ async def read_webpages_content(
405
+ query: str,
406
+ urls: List[str],
407
+ user: KhojUser,
408
+ send_status_func: Optional[Callable] = None,
409
+ agent: Agent = None,
410
+ tracer: dict = {},
411
+ ):
394
412
  logger.info(f"Reading web pages at: {urls}")
395
413
  if send_status_func:
396
414
  webpage_links_str = "\n- " + "\n- ".join(list(urls))
@@ -161,7 +161,7 @@ async def generate_python_code(
161
161
  )
162
162
 
163
163
  # Extract python code wrapped in markdown code blocks from the response
164
- code_blocks = re.findall(r"```(?:python)?\n(.*?)```", response, re.DOTALL)
164
+ code_blocks = re.findall(r"```(?:python)?\n(.*?)```", response.text, re.DOTALL)
165
165
 
166
166
  if not code_blocks:
167
167
  raise ValueError("No Python code blocks found in response")
khoj/routers/api_chat.py CHANGED
@@ -1390,7 +1390,7 @@ async def chat(
1390
1390
  continue
1391
1391
  if cancellation_event.is_set():
1392
1392
  break
1393
- message = item.response
1393
+ message = item.text
1394
1394
  full_response += message if message else ""
1395
1395
  if item.thought:
1396
1396
  async for result in send_event(ChatEvent.THOUGHT, item.thought):
khoj/routers/helpers.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import asyncio
2
2
  import base64
3
3
  import concurrent.futures
4
+ import fnmatch
4
5
  import hashlib
5
6
  import json
6
7
  import logging
@@ -120,6 +121,7 @@ from khoj.utils.config import OfflineChatProcessorModel
120
121
  from khoj.utils.helpers import (
121
122
  LRU,
122
123
  ConversationCommand,
124
+ ToolDefinition,
123
125
  get_file_type,
124
126
  in_debug_mode,
125
127
  is_none_or_empty,
@@ -303,7 +305,7 @@ async def acreate_title_from_history(
303
305
  with timer("Chat actor: Generate title from conversation history", logger):
304
306
  response = await send_message_to_model_wrapper(title_generation_prompt, user=user)
305
307
 
306
- return response.strip()
308
+ return response.text.strip()
307
309
 
308
310
 
309
311
  async def acreate_title_from_query(query: str, user: KhojUser = None) -> str:
@@ -315,7 +317,7 @@ async def acreate_title_from_query(query: str, user: KhojUser = None) -> str:
315
317
  with timer("Chat actor: Generate title from query", logger):
316
318
  response = await send_message_to_model_wrapper(title_generation_prompt, user=user)
317
319
 
318
- return response.strip()
320
+ return response.text.strip()
319
321
 
320
322
 
321
323
  async def acheck_if_safe_prompt(system_prompt: str, user: KhojUser = None, lax: bool = False) -> Tuple[bool, str]:
@@ -339,7 +341,7 @@ async def acheck_if_safe_prompt(system_prompt: str, user: KhojUser = None, lax:
339
341
  safe_prompt_check, user=user, response_type="json_object", response_schema=SafetyCheck
340
342
  )
341
343
 
342
- response = response.strip()
344
+ response = response.text.strip()
343
345
  try:
344
346
  response = json.loads(clean_json(response))
345
347
  is_safe = str(response.get("safe", "true")).lower() == "true"
@@ -418,7 +420,7 @@ async def aget_data_sources_and_output_format(
418
420
  output: str
419
421
 
420
422
  with timer("Chat actor: Infer information sources to refer", logger):
421
- response = await send_message_to_model_wrapper(
423
+ raw_response = await send_message_to_model_wrapper(
422
424
  relevant_tools_prompt,
423
425
  response_type="json_object",
424
426
  response_schema=PickTools,
@@ -429,7 +431,7 @@ async def aget_data_sources_and_output_format(
429
431
  )
430
432
 
431
433
  try:
432
- response = clean_json(response)
434
+ response = clean_json(raw_response.text)
433
435
  response = json.loads(response)
434
436
 
435
437
  chosen_sources = [s.strip() for s in response.get("source", []) if s.strip()]
@@ -506,7 +508,7 @@ async def infer_webpage_urls(
506
508
  links: List[str] = Field(..., min_items=1, max_items=max_webpages)
507
509
 
508
510
  with timer("Chat actor: Infer webpage urls to read", logger):
509
- response = await send_message_to_model_wrapper(
511
+ raw_response = await send_message_to_model_wrapper(
510
512
  online_queries_prompt,
511
513
  query_images=query_images,
512
514
  response_type="json_object",
@@ -519,7 +521,7 @@ async def infer_webpage_urls(
519
521
 
520
522
  # Validate that the response is a non-empty, JSON-serializable list of URLs
521
523
  try:
522
- response = clean_json(response)
524
+ response = clean_json(raw_response.text)
523
525
  urls = json.loads(response)
524
526
  valid_unique_urls = {str(url).strip() for url in urls["links"] if is_valid_url(url)}
525
527
  if is_none_or_empty(valid_unique_urls):
@@ -571,7 +573,7 @@ async def generate_online_subqueries(
571
573
  queries: List[str] = Field(..., min_items=1, max_items=max_queries)
572
574
 
573
575
  with timer("Chat actor: Generate online search subqueries", logger):
574
- response = await send_message_to_model_wrapper(
576
+ raw_response = await send_message_to_model_wrapper(
575
577
  online_queries_prompt,
576
578
  query_images=query_images,
577
579
  response_type="json_object",
@@ -584,7 +586,7 @@ async def generate_online_subqueries(
584
586
 
585
587
  # Validate that the response is a non-empty, JSON-serializable list
586
588
  try:
587
- response = clean_json(response)
589
+ response = clean_json(raw_response.text)
588
590
  response = pyjson5.loads(response)
589
591
  response = {q.strip() for q in response["queries"] if q.strip()}
590
592
  if not isinstance(response, set) or not response or len(response) == 0:
@@ -645,7 +647,7 @@ async def aschedule_query(
645
647
 
646
648
  # Validate that the response is a non-empty, JSON-serializable list
647
649
  try:
648
- raw_response = raw_response.strip()
650
+ raw_response = raw_response.text.strip()
649
651
  response: Dict[str, str] = json.loads(clean_json(raw_response))
650
652
  if not response or not isinstance(response, Dict) or len(response) != 3:
651
653
  raise AssertionError(f"Invalid response for scheduling query : {response}")
@@ -683,7 +685,7 @@ async def extract_relevant_info(
683
685
  agent_chat_model=agent_chat_model,
684
686
  tracer=tracer,
685
687
  )
686
- return response.strip()
688
+ return response.text.strip()
687
689
 
688
690
 
689
691
  async def extract_relevant_summary(
@@ -726,7 +728,7 @@ async def extract_relevant_summary(
726
728
  agent_chat_model=agent_chat_model,
727
729
  tracer=tracer,
728
730
  )
729
- return response.strip()
731
+ return response.text.strip()
730
732
 
731
733
 
732
734
  async def generate_summary_from_files(
@@ -897,7 +899,7 @@ async def generate_better_diagram_description(
897
899
  agent_chat_model=agent_chat_model,
898
900
  tracer=tracer,
899
901
  )
900
- response = response.strip()
902
+ response = response.text.strip()
901
903
  if response.startswith(('"', "'")) and response.endswith(('"', "'")):
902
904
  response = response[1:-1]
903
905
 
@@ -925,10 +927,10 @@ async def generate_excalidraw_diagram_from_description(
925
927
  raw_response = await send_message_to_model_wrapper(
926
928
  query=excalidraw_diagram_generation, user=user, agent_chat_model=agent_chat_model, tracer=tracer
927
929
  )
928
- raw_response = clean_json(raw_response)
930
+ raw_response_text = clean_json(raw_response.text)
929
931
  try:
930
932
  # Expect response to have `elements` and `scratchpad` keys
931
- response: Dict[str, str] = json.loads(raw_response)
933
+ response: Dict[str, str] = json.loads(raw_response_text)
932
934
  if (
933
935
  not response
934
936
  or not isinstance(response, Dict)
@@ -937,7 +939,7 @@ async def generate_excalidraw_diagram_from_description(
937
939
  ):
938
940
  raise AssertionError(f"Invalid response for generating Excalidraw diagram: {response}")
939
941
  except Exception:
940
- raise AssertionError(f"Invalid response for generating Excalidraw diagram: {raw_response}")
942
+ raise AssertionError(f"Invalid response for generating Excalidraw diagram: {raw_response_text}")
941
943
  if not response or not isinstance(response["elements"], List) or not isinstance(response["elements"][0], Dict):
942
944
  # TODO Some additional validation here that it's a valid Excalidraw diagram
943
945
  raise AssertionError(f"Invalid response for improving diagram description: {response}")
@@ -1048,11 +1050,11 @@ async def generate_better_mermaidjs_diagram_description(
1048
1050
  agent_chat_model=agent_chat_model,
1049
1051
  tracer=tracer,
1050
1052
  )
1051
- response = response.strip()
1052
- if response.startswith(('"', "'")) and response.endswith(('"', "'")):
1053
- response = response[1:-1]
1053
+ response_text = response.text.strip()
1054
+ if response_text.startswith(('"', "'")) and response_text.endswith(('"', "'")):
1055
+ response_text = response_text[1:-1]
1054
1056
 
1055
- return response
1057
+ return response_text
1056
1058
 
1057
1059
 
1058
1060
  async def generate_mermaidjs_diagram_from_description(
@@ -1076,7 +1078,7 @@ async def generate_mermaidjs_diagram_from_description(
1076
1078
  raw_response = await send_message_to_model_wrapper(
1077
1079
  query=mermaidjs_diagram_generation, user=user, agent_chat_model=agent_chat_model, tracer=tracer
1078
1080
  )
1079
- return clean_mermaidjs(raw_response.strip())
1081
+ return clean_mermaidjs(raw_response.text.strip())
1080
1082
 
1081
1083
 
1082
1084
  async def generate_better_image_prompt(
@@ -1151,11 +1153,11 @@ async def generate_better_image_prompt(
1151
1153
  agent_chat_model=agent_chat_model,
1152
1154
  tracer=tracer,
1153
1155
  )
1154
- response = response.strip()
1155
- if response.startswith(('"', "'")) and response.endswith(('"', "'")):
1156
- response = response[1:-1]
1156
+ response_text = response.text.strip()
1157
+ if response_text.startswith(('"', "'")) and response_text.endswith(('"', "'")):
1158
+ response_text = response_text[1:-1]
1157
1159
 
1158
- return response
1160
+ return response_text
1159
1161
 
1160
1162
 
1161
1163
  async def search_documents(
@@ -1329,7 +1331,7 @@ async def extract_questions(
1329
1331
 
1330
1332
  # Extract questions from the response
1331
1333
  try:
1332
- response = clean_json(raw_response)
1334
+ response = clean_json(raw_response.text)
1333
1335
  response = pyjson5.loads(response)
1334
1336
  queries = [q.strip() for q in response["queries"] if q.strip()]
1335
1337
  if not isinstance(queries, list) or not queries:
@@ -1439,6 +1441,7 @@ async def send_message_to_model_wrapper(
1439
1441
  system_message: str = "",
1440
1442
  response_type: str = "text",
1441
1443
  response_schema: BaseModel = None,
1444
+ tools: List[ToolDefinition] = None,
1442
1445
  deepthought: bool = False,
1443
1446
  user: KhojUser = None,
1444
1447
  query_images: List[str] = None,
@@ -1506,6 +1509,7 @@ async def send_message_to_model_wrapper(
1506
1509
  model=chat_model_name,
1507
1510
  response_type=response_type,
1508
1511
  response_schema=response_schema,
1512
+ tools=tools,
1509
1513
  deepthought=deepthought,
1510
1514
  api_base_url=api_base_url,
1511
1515
  tracer=tracer,
@@ -1517,6 +1521,7 @@ async def send_message_to_model_wrapper(
1517
1521
  model=chat_model_name,
1518
1522
  response_type=response_type,
1519
1523
  response_schema=response_schema,
1524
+ tools=tools,
1520
1525
  deepthought=deepthought,
1521
1526
  api_base_url=api_base_url,
1522
1527
  tracer=tracer,
@@ -1528,6 +1533,7 @@ async def send_message_to_model_wrapper(
1528
1533
  model=chat_model_name,
1529
1534
  response_type=response_type,
1530
1535
  response_schema=response_schema,
1536
+ tools=tools,
1531
1537
  deepthought=deepthought,
1532
1538
  api_base_url=api_base_url,
1533
1539
  tracer=tracer,
@@ -2796,3 +2802,264 @@ def get_notion_auth_url(user: KhojUser):
2796
2802
  if not NOTION_OAUTH_CLIENT_ID or not NOTION_OAUTH_CLIENT_SECRET or not NOTION_REDIRECT_URI:
2797
2803
  return None
2798
2804
  return f"https://api.notion.com/v1/oauth/authorize?client_id={NOTION_OAUTH_CLIENT_ID}&redirect_uri={NOTION_REDIRECT_URI}&response_type=code&state={user.uuid}"
2805
+
2806
+
2807
async def view_file_content(
    path: str,
    start_line: Optional[int] = None,
    end_line: Optional[int] = None,
    user: KhojUser = None,
):
    """
    Yield the text of a file in the user's document database, optionally restricted to a line range.

    `start_line` and `end_line` are 1-based and inclusive. A missing bound defaults to the
    first or last line of the file respectively. Every outcome, including failures, is
    yielded as a one-element list holding a dict with `query`, `file` and `compiled` keys.
    Text longer than 10000 characters is cut off and a truncation notice is appended.
    """
    range_suffix = f" (lines {start_line}-{end_line})" if start_line and end_line else ""
    query = f"View file: {path}{range_suffix}"

    def _as_result(text):
        # Every yield from this tool shares the same single-document shape
        return [{"query": query, "file": path, "compiled": text}]

    try:
        # Look the file up by name in the user's documents
        matching_files = await FileObjectAdapters.aget_file_objects_by_name(user, path)
        if not matching_files:
            error_msg = f"File '{path}' not found in user documents"
            logger.warning(error_msg)
            yield _as_result(error_msg)
            return

        # When several files share the name, only the first one is shown
        content = matching_files[0].raw_text

        if start_line is not None or end_line is not None:
            all_lines = content.split("\n")
            total_lines = len(all_lines)
            start_line = start_line or 1
            end_line = end_line or total_lines

            # Reject ranges that are inverted, non-positive or start past the end of the file
            error_msg = None
            if start_line < 1 or end_line < 1 or start_line > end_line:
                error_msg = f"Invalid line range: {start_line}-{end_line}"
            elif start_line > total_lines:
                error_msg = f"Start line {start_line} exceeds total number of lines {total_lines}"
            if error_msg is not None:
                logger.warning(error_msg)
                yield _as_result(error_msg)
                return

            # 1-based inclusive range -> 0-based half-open slice, clamped to the file
            first_idx = max(0, start_line - 1)
            last_idx = min(total_lines, end_line)
            content = "\n".join(all_lines[first_idx:last_idx])

        # Keep the result to a bounded size
        if len(content) > 10000:
            content = content[:10000] + "\n\n[Truncated. Use line numbers to view specific sections.]"

        yield _as_result(content)

    except Exception as e:
        error_msg = f"Error viewing file {path}: {str(e)}"
        logger.error(error_msg, exc_info=True)

        # Report the failure in the same shape as a successful result
        yield _as_result(error_msg)
2882
+
2883
+
2884
async def grep_files(
    regex_pattern: str,
    path_prefix: Optional[str] = None,
    lines_before: Optional[int] = None,
    lines_after: Optional[int] = None,
    user: KhojUser = None,
):
    """
    Search for a regex pattern in files with an optional path prefix and context lines.

    The pattern is compiled case-insensitively and applied line by line to the raw text of
    each file returned by `FileObjectAdapters.aget_file_objects_by_regex`. Each output line
    is formatted as `<file_name>:<line_number>:> <text>` for a matching line and
    `<file_name>:<line_number>: <text>` for a context line. Matches are separated by `--`
    when context lines are requested.

    Every outcome (results, no matches, invalid regex, unexpected error) is yielded as a
    single dict with `query`, `file` and `compiled` keys.
    """

    # Construct the query string based on provided parameters
    def _generate_query(line_count, doc_count, path, pattern, lines_before, lines_after, max_results=1000):
        query = f"**Found {line_count} matches for '{pattern}' in {doc_count} documents**"
        if path:
            query += f" in {path}"
        if lines_before or lines_after or line_count > max_results:
            query += " Showing"
        if lines_before or lines_after:
            context_info = []
            if lines_before:
                context_info.append(f"{lines_before} lines before")
            if lines_after:
                context_info.append(f"{lines_after} lines after")
            query += f" {' and '.join(context_info)}"
        if line_count > max_results:
            if lines_before or lines_after:
                query += " for"
            query += f" first {max_results} results"
        return query

    # Normalize optional arguments. Negative context sizes are clamped to zero,
    # otherwise the context window could exclude the matching line itself.
    path_prefix = path_prefix or ""
    lines_before = max(0, lines_before or 0)
    lines_after = max(0, lines_after or 0)

    # Validate regex pattern
    try:
        regex = re.compile(regex_pattern, re.IGNORECASE)
    except re.error as e:
        yield {
            "query": _generate_query(0, 0, path_prefix, regex_pattern, lines_before, lines_after),
            "file": path_prefix,
            "compiled": f"Invalid regex pattern: {e}",
        }
        return

    try:
        file_matches = await FileObjectAdapters.aget_file_objects_by_regex(user, regex_pattern, path_prefix)

        show_context = lines_before > 0 or lines_after > 0
        # Count matches as they are found. Scanning the formatted output for ":>" would
        # also count context lines whose own text happens to contain ":>".
        match_count = 0
        line_matches = []
        for file_object in file_matches:
            lines = file_object.raw_text.split("\n")
            for line_num, line in enumerate(lines, 1):
                if not regex.search(line):
                    continue
                match_count += 1

                # Calculate start and end indices for context (0-based, half-open)
                start_idx = max(0, line_num - 1 - lines_before)
                end_idx = min(len(lines), line_num + lines_after)

                # Add context lines with line numbers, marking the matching line with ":>"
                for idx in range(start_idx, end_idx):
                    current_line_num = idx + 1
                    marker = ":>" if current_line_num == line_num else ":"
                    line_matches.append(f"{file_object.file_name}:{current_line_num}{marker} {lines[idx]}")

                # Add separator between matches if showing context
                if show_context:
                    line_matches.append("--")

        # Remove the last separator if it exists
        if line_matches and line_matches[-1] == "--":
            line_matches.pop()

        max_results = 1000
        query = _generate_query(
            match_count,
            len(file_matches),
            path_prefix,
            regex_pattern,
            lines_before,
            lines_after,
            max_results,
        )

        # Check if no results found
        if not line_matches:
            yield {"query": query, "file": path_prefix, "compiled": "No matches found."}
            return

        # Truncate matched lines list if too long
        if len(line_matches) > max_results:
            line_matches = line_matches[:max_results] + [
                f"... {len(line_matches) - max_results} more results found. Use stricter regex or path to narrow down results."
            ]

        yield {"query": query, "file": path_prefix, "compiled": "\n".join(line_matches)}

    except Exception as e:
        error_msg = f"Error using grep files tool: {str(e)}"
        logger.error(error_msg, exc_info=True)
        # Yield a dict, matching every other path through this generator
        yield {
            "query": _generate_query(0, 0, path_prefix, regex_pattern, lines_before, lines_after),
            "file": path_prefix,
            "compiled": error_msg,
        }
3006
+
3007
+
3008
async def list_files(
    path: Optional[str] = None,
    pattern: Optional[str] = None,
    user: KhojUser = None,
):
    """
    List files under a given path or glob pattern from the user's document database.

    An empty path or one of the root aliases (`/`, `.`, `./`, `~`, `~/`) lists all of the
    user's files (up to 10000). Any other path is used as a file name prefix. File names are
    reported relative to `path`, then filtered with `fnmatch` when `pattern` is given.
    At most 100 names are returned, followed by a note with the number left out.

    Every outcome, including failures, is yielded as a single dict with `query`, `file`
    and `compiled` keys.
    """

    # Construct the query string based on provided parameters
    def _generate_query(doc_count, path, pattern):
        query = f"**Found {doc_count} files**"
        if path:
            query += f" in {path}"
        if pattern:
            query += f" filtered by {pattern}"
        return query

    path = path or ""
    try:
        # Get user files by path prefix when specified
        list_all = path in ["", "/", ".", "./", "~", "~/"]
        if list_all:
            file_objects = await FileObjectAdapters.aget_all_file_objects(user, limit=10000)
        else:
            file_objects = await FileObjectAdapters.aget_file_objects_by_path_prefix(user, path)

        if not file_objects:
            yield {"query": _generate_query(0, path, pattern), "file": path, "compiled": "No files found."}
            return

        # Extract file names from file objects
        files = [f.file_name for f in file_objects]
        # Convert to relative file path (similar to ls)
        if list_all:
            # Files listed via a root alias were not selected by prefix, so only strip
            # the alias from names that actually start with it. Slicing blindly would
            # chop leading characters off unrelated names (e.g. path "~", "notes/a.md").
            if path:
                files = [f[len(path) :] if f.startswith(path) else f for f in files]
        else:
            files = [f[len(path) :] for f in files]

        # Apply glob pattern filtering if specified
        if pattern:
            files = [f for f in files if fnmatch.fnmatch(f, pattern)]

        query = _generate_query(len(files), path, pattern)
        if not files:
            yield {"query": query, "file": path, "compiled": "No files found."}
            return

        # Truncate the list if it's too long
        max_files = 100
        if len(files) > max_files:
            files = files[:max_files] + [
                f"... {len(files) - max_files} more files found. Use glob pattern to narrow down results."
            ]

        # Prefix the first entry too, so every entry is rendered as a bullet
        yield {"query": query, "file": path, "compiled": "- " + "\n- ".join(files)}

    except Exception as e:
        error_msg = f"Error listing files in {path}: {str(e)}"
        logger.error(error_msg, exc_info=True)
        # Build the query here: the failure may have happened before `query` was assigned
        # in the try block, and reading it would raise UnboundLocalError instead.
        yield {"query": _generate_query(0, path, pattern), "file": path, "compiled": error_msg}