local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. local_deep_research/__init__.py +23 -22
  2. local_deep_research/__main__.py +16 -0
  3. local_deep_research/advanced_search_system/__init__.py +7 -0
  4. local_deep_research/advanced_search_system/filters/__init__.py +8 -0
  5. local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
  6. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
  7. local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
  8. local_deep_research/advanced_search_system/findings/repository.py +452 -0
  9. local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
  10. local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
  11. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
  12. local_deep_research/advanced_search_system/questions/__init__.py +1 -0
  13. local_deep_research/advanced_search_system/questions/base_question.py +64 -0
  14. local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
  15. local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
  16. local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
  17. local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
  18. local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
  19. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
  20. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
  21. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
  22. local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
  23. local_deep_research/advanced_search_system/tools/__init__.py +1 -0
  24. local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
  25. local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
  26. local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
  27. local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
  28. local_deep_research/api/__init__.py +5 -5
  29. local_deep_research/api/research_functions.py +154 -160
  30. local_deep_research/app.py +8 -0
  31. local_deep_research/citation_handler.py +25 -16
  32. local_deep_research/{config.py → config/config_files.py} +102 -110
  33. local_deep_research/config/llm_config.py +472 -0
  34. local_deep_research/config/search_config.py +77 -0
  35. local_deep_research/defaults/__init__.py +10 -5
  36. local_deep_research/defaults/main.toml +2 -2
  37. local_deep_research/defaults/search_engines.toml +60 -34
  38. local_deep_research/main.py +121 -19
  39. local_deep_research/migrate_db.py +147 -0
  40. local_deep_research/report_generator.py +87 -45
  41. local_deep_research/search_system.py +153 -283
  42. local_deep_research/setup_data_dir.py +35 -0
  43. local_deep_research/test_migration.py +178 -0
  44. local_deep_research/utilities/__init__.py +0 -0
  45. local_deep_research/utilities/db_utils.py +49 -0
  46. local_deep_research/{utilties → utilities}/enums.py +2 -2
  47. local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
  48. local_deep_research/utilities/search_utilities.py +242 -0
  49. local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
  50. local_deep_research/web/__init__.py +0 -1
  51. local_deep_research/web/app.py +86 -1709
  52. local_deep_research/web/app_factory.py +289 -0
  53. local_deep_research/web/database/README.md +70 -0
  54. local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
  55. local_deep_research/web/database/migrations.py +447 -0
  56. local_deep_research/web/database/models.py +117 -0
  57. local_deep_research/web/database/schema_upgrade.py +107 -0
  58. local_deep_research/web/models/database.py +294 -0
  59. local_deep_research/web/models/settings.py +94 -0
  60. local_deep_research/web/routes/api_routes.py +559 -0
  61. local_deep_research/web/routes/history_routes.py +354 -0
  62. local_deep_research/web/routes/research_routes.py +715 -0
  63. local_deep_research/web/routes/settings_routes.py +1583 -0
  64. local_deep_research/web/services/research_service.py +947 -0
  65. local_deep_research/web/services/resource_service.py +149 -0
  66. local_deep_research/web/services/settings_manager.py +669 -0
  67. local_deep_research/web/services/settings_service.py +187 -0
  68. local_deep_research/web/services/socket_service.py +210 -0
  69. local_deep_research/web/static/css/custom_dropdown.css +277 -0
  70. local_deep_research/web/static/css/settings.css +1223 -0
  71. local_deep_research/web/static/css/styles.css +525 -48
  72. local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
  73. local_deep_research/web/static/js/components/detail.js +348 -0
  74. local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
  75. local_deep_research/web/static/js/components/fallback/ui.js +215 -0
  76. local_deep_research/web/static/js/components/history.js +487 -0
  77. local_deep_research/web/static/js/components/logpanel.js +949 -0
  78. local_deep_research/web/static/js/components/progress.js +1107 -0
  79. local_deep_research/web/static/js/components/research.js +1865 -0
  80. local_deep_research/web/static/js/components/results.js +766 -0
  81. local_deep_research/web/static/js/components/settings.js +3981 -0
  82. local_deep_research/web/static/js/components/settings_sync.js +106 -0
  83. local_deep_research/web/static/js/main.js +226 -0
  84. local_deep_research/web/static/js/services/api.js +253 -0
  85. local_deep_research/web/static/js/services/audio.js +31 -0
  86. local_deep_research/web/static/js/services/formatting.js +119 -0
  87. local_deep_research/web/static/js/services/pdf.js +622 -0
  88. local_deep_research/web/static/js/services/socket.js +882 -0
  89. local_deep_research/web/static/js/services/ui.js +546 -0
  90. local_deep_research/web/templates/base.html +72 -0
  91. local_deep_research/web/templates/components/custom_dropdown.html +47 -0
  92. local_deep_research/web/templates/components/log_panel.html +32 -0
  93. local_deep_research/web/templates/components/mobile_nav.html +22 -0
  94. local_deep_research/web/templates/components/settings_form.html +299 -0
  95. local_deep_research/web/templates/components/sidebar.html +21 -0
  96. local_deep_research/web/templates/pages/details.html +73 -0
  97. local_deep_research/web/templates/pages/history.html +51 -0
  98. local_deep_research/web/templates/pages/progress.html +57 -0
  99. local_deep_research/web/templates/pages/research.html +139 -0
  100. local_deep_research/web/templates/pages/results.html +59 -0
  101. local_deep_research/web/templates/settings_dashboard.html +78 -192
  102. local_deep_research/web/utils/__init__.py +0 -0
  103. local_deep_research/web/utils/formatters.py +76 -0
  104. local_deep_research/web_search_engines/engines/full_search.py +18 -16
  105. local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
  106. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
  107. local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
  108. local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
  109. local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
  110. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
  111. local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
  112. local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
  113. local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
  114. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
  115. local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
  116. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
  117. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
  118. local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
  119. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
  120. local_deep_research/web_search_engines/search_engine_base.py +174 -99
  121. local_deep_research/web_search_engines/search_engine_factory.py +192 -102
  122. local_deep_research/web_search_engines/search_engines_config.py +22 -15
  123. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
  124. local_deep_research-0.2.2.dist-info/RECORD +135 -0
  125. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
  126. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
  127. local_deep_research/defaults/llm_config.py +0 -338
  128. local_deep_research/utilties/search_utilities.py +0 -114
  129. local_deep_research/web/static/js/app.js +0 -3763
  130. local_deep_research/web/templates/api_keys_config.html +0 -82
  131. local_deep_research/web/templates/collections_config.html +0 -90
  132. local_deep_research/web/templates/index.html +0 -348
  133. local_deep_research/web/templates/llm_config.html +0 -120
  134. local_deep_research/web/templates/main_config.html +0 -89
  135. local_deep_research/web/templates/search_engines_config.html +0 -154
  136. local_deep_research/web/templates/settings.html +0 -519
  137. local_deep_research-0.1.26.dist-info/RECORD +0 -61
  138. local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
  139. /local_deep_research/{utilties → config}/__init__.py +0 -0
  140. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
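Note on the module layout: 0.2.2 renames the misspelled `utilties` package to `utilities` and splits the old `config.py` into a `config` package (`config_files.py`, `llm_config.py`, `search_config.py`), as the listing above shows. A minimal sketch of the corresponding import updates for downstream code, assuming the public import paths simply mirror the file moves listed above:

# 0.1.26
# from local_deep_research.utilties import enums
# from local_deep_research import config

# 0.2.2 (paths assumed from the renames above)
from local_deep_research.utilities import enums
from local_deep_research.config import search_config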
@@ -1,46 +1,107 @@
- from typing import Dict, List, Any, Optional, Tuple, Union
- import os
- import json
  import hashlib
+ import json
+ import logging
+ import os
  import time
+ import uuid
+ from concurrent.futures import ProcessPoolExecutor
  from datetime import datetime
  from pathlib import Path
- import tiktoken
- import logging
- import re
- import pickle
+ from typing import Any, Dict, Iterable, List, Optional

- from faiss import normalize_L2
- from langchain_core.language_models import BaseLLM
+ from faiss import IndexFlatL2
+ from langchain_community.docstore.in_memory import InMemoryDocstore
  from langchain_community.document_loaders import (
- PyPDFLoader,
- TextLoader,
- UnstructuredMarkdownLoader,
- UnstructuredWordDocumentLoader,
  CSVLoader,
+ PyPDFLoader,
+ TextLoader,
  UnstructuredExcelLoader,
- DirectoryLoader
+ UnstructuredMarkdownLoader,
+ UnstructuredWordDocumentLoader,
  )
  from langchain_community.document_loaders.base import BaseLoader
- from langchain_text_splitters import RecursiveCharacterTextSplitter
- from langchain_community.vectorstores import FAISS
- from langchain_community.vectorstores.utils import DistanceStrategy
  from langchain_community.embeddings import (
  HuggingFaceEmbeddings,
  OllamaEmbeddings,
- SentenceTransformerEmbeddings
+ SentenceTransformerEmbeddings,
  )
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.documents import Document
+ from langchain_core.language_models import BaseLLM
+ from langchain_text_splitters import RecursiveCharacterTextSplitter

- from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
- from local_deep_research import config
+ from ...config import search_config
+ from ..search_engine_base import BaseSearchEngine

  # Setup logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+
+ def _get_file_loader(file_path: str) -> Optional[BaseLoader]:
+ """Get an appropriate document loader for a file based on its extension"""
+ file_path = Path(file_path)
+ extension = file_path.suffix.lower()
+
+ try:
+ if extension == ".pdf":
+ return PyPDFLoader(str(file_path))
+ elif extension == ".txt":
+ return TextLoader(str(file_path))
+ elif extension in [".md", ".markdown"]:
+ return UnstructuredMarkdownLoader(str(file_path))
+ elif extension in [".doc", ".docx"]:
+ return UnstructuredWordDocumentLoader(str(file_path))
+ elif extension == ".csv":
+ return CSVLoader(str(file_path))
+ elif extension in [".xls", ".xlsx"]:
+ return UnstructuredExcelLoader(str(file_path))
+ else:
+ # Try the text loader as a fallback for unknown extensions
+ logger.warning(f"Unknown file extension for {file_path}, trying TextLoader")
+ return TextLoader(str(file_path), encoding="utf-8")
+ except Exception as e:
+ logger.error(f"Error creating loader for {file_path}: {e}")
+ return None
+
+
+ def _load_document(file_path: Path) -> List[Document]:
+ """
+ Loads documents from a file.
+
+ Args:
+ file_path: The path to the document to load.
+
+ Returns:
+ The loaded documents, or an empty list if it failed to load.
+
+ """
+ # Get a loader for this file
+ loader = _get_file_loader(str(file_path))
+
+ if loader is None:
+ # No loader for this filetype.
+ return []
+
+ try:
+ # Load the document
+ docs = loader.load()
+
+ # Add source path metadata and ID.
+ for doc in docs:
+ doc.metadata["source"] = str(file_path)
+ doc.metadata["filename"] = file_path.name
+
+ except Exception as e:
+ logger.error(f"Error loading {file_path}: {e}")
+ return []
+
+ return docs
+
+
  class LocalEmbeddingManager:
  """Handles embedding generation and storage for local document search"""
-
+
  def __init__(
  self,
  embedding_model: str = "all-MiniLM-L6-v2",
@@ -53,7 +114,7 @@ class LocalEmbeddingManager:
  ):
  """
  Initialize the embedding manager for local document search.
-
+
  Args:
  embedding_model: Name of the embedding model to use
  embedding_device: Device to run embeddings on ('cpu' or 'cuda')
@@ -63,7 +124,7 @@ class LocalEmbeddingManager:
  chunk_overlap: Overlap between chunks
  cache_dir: Directory to store embedding cache and index
  """
-
+
  self.embedding_model = embedding_model
  self.embedding_device = embedding_device
  self.embedding_model_type = embedding_model_type
@@ -71,29 +132,29 @@ class LocalEmbeddingManager:
  self.chunk_size = chunk_size
  self.chunk_overlap = chunk_overlap
  self.cache_dir = Path(cache_dir)
-
+
  # Create cache directory if it doesn't exist
  self.cache_dir.mkdir(parents=True, exist_ok=True)
-
+
  # Initialize the embedding model
  self._embeddings = None
-
+
  # Initialize the text splitter
  self.text_splitter = RecursiveCharacterTextSplitter(
- chunk_size=chunk_size,
- chunk_overlap=chunk_overlap
+ chunk_size=chunk_size, chunk_overlap=chunk_overlap
  )
-
+
  # Track indexed folders and their metadata
  self.indexed_folders = self._load_indexed_folders()
-
+
  # Vector store cache
  self.vector_stores = {}
+
  @property
  def embeddings(self):
  """
  Lazily initialize embeddings when first accessed.
- This allows the LocalEmbeddingManager to be created without
+ This allows the LocalEmbeddingManager to be created without
  immediately loading models, which is helpful when no local search is performed.
  """
  if self._embeddings is None:
@@ -107,30 +168,38 @@ class LocalEmbeddingManager:
  if self.embedding_model_type == "ollama":
  # Use Ollama for embeddings
  if not self.ollama_base_url:
- self.ollama_base_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
-
- logger.info(f"Initializing Ollama embeddings with model {self.embedding_model}")
+ self.ollama_base_url = os.getenv(
+ "OLLAMA_BASE_URL", "http://localhost:11434"
+ )
+
+ logger.info(
+ f"Initializing Ollama embeddings with model {self.embedding_model}"
+ )
  return OllamaEmbeddings(
- model=self.embedding_model,
- base_url=self.ollama_base_url
+ model=self.embedding_model, base_url=self.ollama_base_url
  )
  else:
  # Default: Use SentenceTransformers/HuggingFace
- logger.info(f"Initializing SentenceTransformerEmbeddings with model {self.embedding_model}")
+ logger.info(
+ f"Initializing SentenceTransformerEmbeddings with model {self.embedding_model}"
+ )
  return SentenceTransformerEmbeddings(
  model_name=self.embedding_model,
- model_kwargs={"device": self.embedding_device}
+ model_kwargs={"device": self.embedding_device},
  )
  except Exception as e:
  logger.error(f"Error initializing embeddings: {e}")
- logger.warning("Falling back to HuggingFaceEmbeddings with all-MiniLM-L6-v2")
+ logger.warning(
+ "Falling back to HuggingFaceEmbeddings with all-MiniLM-L6-v2"
+ )
  return HuggingFaceEmbeddings(
  model_name="sentence-transformers/all-MiniLM-L6-v2"
  )
+
  def _load_or_create_vector_store(self):
  """Load the vector store from disk or create it if needed"""
  vector_store_path = self._get_vector_store_path()
-
+
  # Check if vector store exists and is up to date
  if vector_store_path.exists() and not self._check_folders_modified():
  logger.info(f"Loading existing vector store from {vector_store_path}")
@@ -139,86 +208,121 @@ class LocalEmbeddingManager:
  str(vector_store_path),
  self.embeddings,
  allow_dangerous_deserialization=True,
- normalize_L2=True
+ normalize_L2=True,
  )
-
+
  # Add this code to show document count
  doc_count = len(vector_store.index_to_docstore_id)
  logger.info(f"Loaded index with {doc_count} document chunks")
-
+
  return vector_store
  except Exception as e:
  logger.error(f"Error loading vector store: {e}")
  logger.info("Will create a new vector store")
-
+
  # Create a new vector store
- return self._create_vector_store()
+ return self._create_vector_store()
+
  def _load_indexed_folders(self) -> Dict[str, Dict[str, Any]]:
  """Load metadata about indexed folders from disk"""
  index_metadata_path = self.cache_dir / "index_metadata.json"
-
+
  if index_metadata_path.exists():
  try:
  with open(index_metadata_path, "r") as f:
  return json.load(f)
  except Exception as e:
  logger.error(f"Error loading index metadata: {e}")
-
+
  return {}
-
+
  def _save_indexed_folders(self):
  """Save metadata about indexed folders to disk"""
  index_metadata_path = self.cache_dir / "index_metadata.json"
-
+
  try:
  with open(index_metadata_path, "w") as f:
  json.dump(self.indexed_folders, f, indent=2)
  except Exception as e:
  logger.error(f"Error saving index metadata: {e}")
-
- def _get_folder_hash(self, folder_path: str) -> str:
+
+ @staticmethod
+ def get_folder_hash(folder_path: Path) -> str:
  """Generate a hash for a folder based on its path"""
- # Strip trailing slashes if we have them.
- if folder_path.endswith("/"):
- folder_path = folder_path[:-1]
+ # Canonicalize the path so we don't have weird Windows vs. Linux
+ # problems or issues with trailing slashes.
+ canonical_folder_path = "/".join(folder_path.parts)
+ return hashlib.md5(canonical_folder_path.encode()).hexdigest()

- return hashlib.md5(folder_path.encode()).hexdigest()
-
- def _get_index_path(self, folder_path: str) -> Path:
+ def _get_index_path(self, folder_path: Path) -> Path:
  """Get the path where the index for a specific folder should be stored"""
- folder_hash = self._get_folder_hash(folder_path)
+ folder_hash = self.get_folder_hash(folder_path)
  return self.cache_dir / f"index_{folder_hash}"
-
- def _check_folder_modified(self, folder_path: str) -> bool:
+
+ def _check_folder_modified(self, folder_path: Path) -> bool:
  """Check if a folder has been modified since it was last indexed"""
- folder_path = Path(folder_path)
-
+
+ @staticmethod
+ def _get_all_files(folder_path: Path) -> Iterable[Path]:
+ """
+ Gets all the files, recursively, in a folder.
+
+ Args:
+ folder_path: The path to the folder.
+
+ Yields:
+ Each of the files in the folder.
+
+ """
+ for root, _, files in os.walk(folder_path):
+ for file in files:
+ yield Path(root) / file
+
+ def _get_modified_files(self, folder_path: Path) -> List[Path]:
+ """
+ Gets the files in a folder that have been modified since it was last
+ indexed.
+
+ Args:
+ folder_path: The path to the folder to check.
+
+ Returns:
+ A list of the files that were modified.
+
+ """
  if not folder_path.exists() or not folder_path.is_dir():
- return False
-
- folder_hash = self._get_folder_hash(str(folder_path))
-
- # If folder has never been indexed, it's considered modified
+ return []
+
+ folder_hash = self.get_folder_hash(folder_path)
+
  if folder_hash not in self.indexed_folders:
- return True
-
- last_indexed = self.indexed_folders[folder_hash].get("last_indexed", 0)
-
+ # If folder has never been indexed, everything has been modified.
+ last_indexed = 0
+ indexed_files = set()
+ else:
+ last_indexed = self.indexed_folders[folder_hash].get("last_indexed", 0)
+ indexed_files = (
+ self.indexed_folders[folder_hash].get("indexed_files", {}).keys()
+ )
+
  # Check if any file in the folder has been modified since last indexing
- for root, _, files in os.walk(folder_path):
- for file in files:
- file_path = Path(root) / file
- if file_path.stat().st_mtime > last_indexed:
- return True
-
- return False
+ modified_files = []
+ for file_path in self._get_all_files(folder_path):
+ file_stats = file_path.stat()
+ if file_stats.st_mtime > last_indexed:
+ modified_files.append(file_path)
+ elif str(file_path.relative_to(folder_path)) not in indexed_files:
+ # This file somehow never got indexed.
+ modified_files.append(file_path)

- def _check_config_changed(self, folder_path: str) -> bool:
+ return modified_files
+
+ def _check_config_changed(self, folder_path: Path) -> bool:
  """
  Checks if the embedding configuration for a folder has been changed
  since it was last indexed.
  """
- folder_hash = self._get_folder_hash(folder_path)
+ folder_hash = self.get_folder_hash(folder_path)

  if folder_hash not in self.indexed_folders:
  # It hasn't been indexed at all. That's a new configuration,
@@ -231,228 +335,225 @@ class LocalEmbeddingManager:
  embedding_model = embedding_config.get("embedding_model", "")

  if (chunk_size, chunk_overlap, embedding_model) != (
- self.chunk_size, self.chunk_overlap, self.embedding_model
+ self.chunk_size,
+ self.chunk_overlap,
+ self.embedding_model,
  ):
- logger.info(
- "Embedding configuration has changed, re-indexing folder."
- )
+ logger.info("Embedding configuration has changed, re-indexing folder.")
  return True
  return False
-
- def get_file_loader(self, file_path: str) -> Optional[BaseLoader]:
- """Get an appropriate document loader for a file based on its extension"""
- file_path = Path(file_path)
- extension = file_path.suffix.lower()
-
- try:
- if extension == ".pdf":
- return PyPDFLoader(str(file_path))
- elif extension == ".txt":
- return TextLoader(str(file_path))
- elif extension in [".md", ".markdown"]:
- return UnstructuredMarkdownLoader(str(file_path))
- elif extension in [".doc", ".docx"]:
- return UnstructuredWordDocumentLoader(str(file_path))
- elif extension == ".csv":
- return CSVLoader(str(file_path))
- elif extension in [".xls", ".xlsx"]:
- return UnstructuredExcelLoader(str(file_path))
- else:
- # Try the text loader as a fallback for unknown extensions
- logger.warning(f"Unknown file extension for {file_path}, trying TextLoader")
- return TextLoader(str(file_path), encoding="utf-8")
- except Exception as e:
- logger.error(f"Error creating loader for {file_path}: {e}")
- return None
-
+
  def index_folder(self, folder_path: str, force_reindex: bool = False) -> bool:
  """
  Index all documents in a folder for vector search.
-
+
  Args:
  folder_path: Path to the folder to index
  force_reindex: Whether to force reindexing even if unchanged
-
+
  Returns:
  bool: True if indexing was successful, False otherwise
  """
  folder_path = Path(folder_path)
-
+
  # Validate folder
  if not folder_path.exists():
  logger.error(f"Folder not found: {folder_path}")
  return False
-
+
  if not folder_path.is_dir():
  logger.error(f"Path is not a directory: {folder_path}")
  return False
-
+
  folder_str = str(folder_path)
- folder_hash = self._get_folder_hash(folder_str)
- index_path = self._get_index_path(folder_str)
-
- # Check if folder needs to be reindexed
- if (not force_reindex and not self._check_folder_modified(folder_str)
- and not self._check_config_changed(folder_str)):
- logger.info(f"Folder {folder_path} has not been modified since last indexing")
-
- # Load the vector store from disk if not already loaded
- if folder_hash not in self.vector_stores:
- try:
- self.vector_stores[folder_hash] = FAISS.load_local(
- str(index_path),
- self.embeddings,
- allow_dangerous_deserialization=True,
- normalize_L2=True,
- )
- logger.info(f"Loaded index for {folder_path} from disk")
- except Exception as e:
- logger.error(f"Error loading index for {folder_path}: {e}")
- # If loading fails, force reindexing
- force_reindex = True
- else:
- logger.info(f"Using cached index for {folder_path}")
-
- # If no reindexing is needed and vector store loaded successfully
- if not force_reindex and folder_hash in self.vector_stores:
- return True
-
+ folder_hash = self.get_folder_hash(folder_path)
+ index_path = self._get_index_path(folder_path)
+
+ if force_reindex or self._check_config_changed(folder_path):
+ logger.info(f"Re-indexing entire folder: {folder_path}")
+ modified_files = list(self._get_all_files(folder_path))
+ else:
+ # Just re-index the modified files if we can get away with it.
+ modified_files = self._get_modified_files(folder_path)
+ logger.info(f"Re-indexing {len(modified_files)} modified files...")
+
+ # Load the vector store from disk if not already loaded
+ if folder_hash not in self.vector_stores and index_path.exists():
+ try:
+ self.vector_stores[folder_hash] = FAISS.load_local(
+ str(index_path),
+ self.embeddings,
+ allow_dangerous_deserialization=True,
+ normalize_L2=True,
+ )
+ logger.info(f"Loaded index for {folder_path} from disk")
+ except Exception as e:
+ logger.error(f"Error loading index for {folder_path}: {e}")
+ # If loading fails, force reindexing
+ force_reindex = True
+
  logger.info(f"Indexing folder: {folder_path}")
  start_time = time.time()
-
+
  # Find documents to index
  all_docs = []
- file_count = 0
- error_count = 0
-
- for root, _, files in os.walk(folder_path):
- for file in files:
- file_path = Path(root) / file
-
- # Skip hidden files and directories
- if file.startswith(".") or any(part.startswith(".") for part in file_path.parts):
- continue
-
- # Get a loader for this file
- loader = self.get_file_loader(str(file_path))
-
- if loader:
- try:
- # Load the document
- docs = loader.load()
-
- # Add source path metadata
- for doc in docs:
- doc.metadata["source"] = str(file_path)
- doc.metadata["filename"] = file
-
- all_docs.extend(docs)
- file_count += 1
- except Exception as e:
- logger.error(f"Error loading {file_path}: {e}")
- error_count += 1
-
- if not all_docs:
- logger.warning(f"No documents found in {folder_path} or all documents failed to load")
- return False
-
+
+ # Remove hidden files and directories.
+ modified_files = [
+ p
+ for p in modified_files
+ if not p.name.startswith(".")
+ and not any(part.startswith(".") for part in p.parts)
+ ]
+ # Index them.
+ with ProcessPoolExecutor() as executor:
+ all_docs_nested = executor.map(_load_document, modified_files)
+ # Flatten the result.
+ for docs in all_docs_nested:
+ all_docs.extend(docs)
+
+ if force_reindex or folder_hash not in self.vector_stores:
+ logger.info(f"Creating new index for {folder_path}")
+ # Embed a test query to figure out embedding length.
+ test_embedding = self.embeddings.embed_query("hello world")
+ index = IndexFlatL2(len(test_embedding))
+ self.vector_stores[folder_hash] = FAISS(
+ self.embeddings,
+ index=index,
+ docstore=InMemoryDocstore(),
+ index_to_docstore_id={},
+ normalize_L2=True,
+ )
+
  # Split documents into chunks
  logger.info(f"Splitting {len(all_docs)} documents into chunks")
  splits = self.text_splitter.split_documents(all_docs)
- logger.info(f"Created {len(splits)} chunks from {file_count} files")
-
+ logger.info(f"Created {len(splits)} chunks from {len(modified_files)} files")
+
  # Create vector store
- logger.info(f"Creating vector store with {len(splits)} chunks")
- vector_store = FAISS.from_documents(
- splits,
- self.embeddings,
- normalize_L2=True
- )
-
+ ids = []
+ if splits:
+ logger.info(f"Adding {len(splits)} chunks to vector store")
+ ids = [uuid.uuid4().hex for _ in splits]
+ self.vector_stores[folder_hash].add_documents(splits, ids=ids)
+
+ # Update indexing time for individual files.
+ index_time = time.time()
+ indexed_files = {}
+ if folder_hash in self.indexed_folders:
+ indexed_files = (
+ self.indexed_folders[folder_hash].get("indexed_files", {}).copy()
+ )
+ for split_id, split in zip(ids, splits):
+ split_source = str(Path(split.metadata["source"]).relative_to(folder_path))
+ id_list = indexed_files.setdefault(split_source, [])
+ id_list.append(split_id)
+
+ # Check for any files that were removed and remove them from the
+ # vector store.
+ delete_ids = []
+ delete_paths = []
+ for relative_path, chunk_ids in indexed_files.items():
+ if not (folder_path / Path(relative_path)).exists():
+ delete_ids.extend(chunk_ids)
+ delete_paths.append(relative_path)
+ if delete_ids:
+ logger.info(
+ f"Deleting {len(delete_paths)} non-existent files from the " f"index."
+ )
+ self.vector_stores[folder_hash].delete(delete_ids)
+ for path in delete_paths:
+ del indexed_files[path]
+
  # Save the vector store to disk
  logger.info(f"Saving index to {index_path}")
- vector_store.save_local(str(index_path))
-
- # Update cache
- self.vector_stores[folder_hash] = vector_store
-
+ self.vector_stores[folder_hash].save_local(str(index_path))
+
  # Update metadata
  self.indexed_folders[folder_hash] = {
  "path": folder_str,
- "last_indexed": time.time(),
- "file_count": file_count,
+ "last_indexed": index_time,
+ "file_count": len(modified_files),
  "chunk_count": len(splits),
- "error_count": error_count,
  "embedding_model": self.embedding_model,
  "chunk_size": self.chunk_size,
- "chunk_overlap": self.chunk_overlap
+ "chunk_overlap": self.chunk_overlap,
+ "indexed_files": indexed_files,
  }
-
+
  # Save updated metadata
  self._save_indexed_folders()
-
+
  elapsed_time = time.time() - start_time
- logger.info(f"Indexed {file_count} files in {elapsed_time:.2f} seconds")
-
+ logger.info(
+ f"Indexed {len(modified_files)} files in {elapsed_time:.2f} seconds"
+ )
+
  return True
-
+
  def search(
- self,
- query: str,
+ self,
+ query: str,
  folder_paths: List[str],
  limit: int = 10,
  score_threshold: float = 0.0,
  ) -> List[Dict[str, Any]]:
  """
  Search for documents relevant to a query across specified folders.
-
+
  Args:
  query: The search query
  folder_paths: List of folder paths to search in
  limit: Maximum number of results to return
  score_threshold: Minimum similarity score threshold
-
+
  Returns:
  List of results with document content and metadata
  """
+ folder_paths = [Path(p) for p in folder_paths]
+
  # Add detailed debugging for each folder
  for folder_path in folder_paths:
- folder_hash = self._get_folder_hash(folder_path)
+ folder_hash = self.get_folder_hash(folder_path)
  index_path = self._get_index_path(folder_path)
-
+
  logger.info(f"Diagnostic for {folder_path}:")
  logger.info(f" - Folder hash: {folder_hash}")
  logger.info(f" - Index path: {index_path}")
  logger.info(f" - Index exists on disk: {index_path.exists()}")
- logger.info(f" - Is in indexed_folders: {folder_hash in self.indexed_folders}")
-
+ logger.info(
+ f" - Is in indexed_folders: {folder_hash in self.indexed_folders}"
+ )
+
  if folder_hash in self.indexed_folders:
  meta = self.indexed_folders[folder_hash]
- logger.info(f" - Metadata: file_count={meta.get('file_count', 0)}, chunk_count={meta.get('chunk_count', 0)}")
-
+ logger.info(
+ f" - Metadata: file_count={meta.get('file_count', 0)}, chunk_count={meta.get('chunk_count', 0)}"
+ )
+
  # Validate folders exist
  valid_folder_paths = []
  for path in folder_paths:
- if os.path.exists(path) and os.path.isdir(path):
+ if path.exists() and path.is_dir():
  valid_folder_paths.append(path)
  else:
  logger.warning(f"Skipping non-existent folder in search: {path}")
-
+
  # If no valid folders, return empty results
  if not valid_folder_paths:
  logger.warning(f"No valid folders to search among: {folder_paths}")
  return []
-
+
  all_results = []
-
+
  for folder_path in valid_folder_paths:
- folder_hash = self._get_folder_hash(folder_path)
-
+ folder_hash = self.get_folder_hash(folder_path)
+
  # Skip folders that haven't been indexed
  if folder_hash not in self.indexed_folders:
  logger.warning(f"Folder {folder_path} has not been indexed")
  continue
-
+
  # Make sure the vector store is loaded
  if folder_hash not in self.vector_stores:
  index_path = self._get_index_path(folder_path)
@@ -461,74 +562,71 @@ class LocalEmbeddingManager:
  str(index_path),
  self.embeddings,
  allow_dangerous_deserialization=True,
- nomalize_L2=True
+ nomalize_L2=True,
  )
  except Exception as e:
  logger.error(f"Error loading index for {folder_path}: {e}")
  continue
-
+
  # Search in this folder
  vector_store = self.vector_stores[folder_hash]
-
+
  try:
- docs_with_scores = (
- vector_store.similarity_search_with_relevance_scores(
- query,
- k=limit
- )
+ docs_with_scores = vector_store.similarity_search_with_relevance_scores(
+ query, k=limit
  )
-
+
  for doc, similarity in docs_with_scores:
  # Skip results below the threshold
  if similarity < score_threshold:
  continue
-
+
  result = {
  "content": doc.page_content,
  "metadata": doc.metadata,
  "similarity": float(similarity),
- "folder": folder_path
+ "folder": folder_path,
  }
-
+
  all_results.append(result)
  except Exception as e:
  logger.error(f"Error searching in {folder_path}: {e}")
-
+
  # Sort by similarity (highest first)
  all_results.sort(key=lambda x: x["similarity"], reverse=True)
-
+
  # Limit to the requested number
  return all_results[:limit]
-
+
  def clear_cache(self):
  """Clear all cached vector stores from memory (not disk)"""
  self.vector_stores.clear()
-
+
  def get_indexed_folders_info(self) -> List[Dict[str, Any]]:
  """Get information about all indexed folders"""
  info = []
-
+
  for folder_hash, metadata in self.indexed_folders.items():
  folder_info = metadata.copy()
-
+
  # Add formatted last indexed time
  if "last_indexed" in folder_info:
  folder_info["last_indexed_formatted"] = datetime.fromtimestamp(
  folder_info["last_indexed"]
  ).strftime("%Y-%m-%d %H:%M:%S")
-
+
  # Check if index file exists
- index_path = self._get_index_path(folder_info["path"])
+ index_path = self._get_index_path(Path(folder_info["path"]))
  folder_info["index_exists"] = index_path.exists()
-
+
  info.append(folder_info)
-
+
  return info


  class LocalSearchEngine(BaseSearchEngine):
  """Local document search engine with two-phase retrieval"""
-
+
  def __init__(
  self,
  paths: List[str],
@@ -547,7 +645,7 @@ class LocalSearchEngine(BaseSearchEngine):
  ):
  """
  Initialize the local search engine.
-
+
  Args:
  paths: List of folder paths to search in
  llm: Language model for relevance filtering
@@ -565,7 +663,7 @@ class LocalSearchEngine(BaseSearchEngine):
  """
  # Initialize the base search engine
  super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
+
  # Validate folder paths
  self.folder_paths = paths
  self.valid_folder_paths = []
@@ -574,15 +672,19 @@ class LocalSearchEngine(BaseSearchEngine):
  self.valid_folder_paths.append(path)
  else:
  logger.warning(f"Folder not found or is not a directory: {path}")
-
+
  # If no valid folders, log a clear message
  if not self.valid_folder_paths and paths:
  logger.warning(f"No valid folders found among: {paths}")
- logger.warning("This search engine will return no results until valid folders are configured")
-
+ logger.warning(
+ "This search engine will return no results until valid folders are configured"
+ )
+
  self.max_results = max_results
- self.collections = collections or {"default": {"paths": paths, "description": "Default collection"}}
-
+ self.collections = collections or {
+ "default": {"paths": paths, "description": "Default collection"}
+ }
+
  # Initialize the embedding manager with only valid folders
  self.embedding_manager = LocalEmbeddingManager(
  embedding_model=embedding_model,
@@ -591,339 +693,400 @@ class LocalSearchEngine(BaseSearchEngine):
  ollama_base_url=ollama_base_url,
  chunk_size=chunk_size,
  chunk_overlap=chunk_overlap,
- cache_dir=cache_dir
+ cache_dir=cache_dir,
  )
-
+
  # Index all folders
  self._index_folders(force_reindex)
-
+
  def _index_folders(self, force_reindex: bool = False):
  """Index all valid configured folders"""
  indexed = []
  failed = []
  skipped = []
-
+
  # Keep track of invalid folders
  for folder in self.folder_paths:
  if folder not in self.valid_folder_paths:
  skipped.append(folder)
  continue
-
+
  success = self.embedding_manager.index_folder(folder, force_reindex)
  if success:
  indexed.append(folder)
  else:
  failed.append(folder)
-
+
  if indexed:
- logger.info(f"Successfully indexed {len(indexed)} folders: {', '.join(indexed)}")
-
+ logger.info(
+ f"Successfully indexed {len(indexed)} folders: {', '.join(indexed)}"
+ )
+
  if failed:
- logger.warning(f"Failed to index {len(failed)} folders: {', '.join(failed)}")
-
+ logger.warning(
+ f"Failed to index {len(failed)} folders: {', '.join(failed)}"
+ )
+
  if skipped:
- logger.warning(f"Skipped {len(skipped)} invalid folders: {', '.join(skipped)}")
-
- def _get_previews(self, query: str, collection_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+ logger.warning(
+ f"Skipped {len(skipped)} invalid folders: {', '.join(skipped)}"
+ )
+
+ def _get_previews(
+ self, query: str, collection_names: Optional[List[str]] = None
+ ) -> List[Dict[str, Any]]:
  """
  Get preview information for documents matching the query.
-
+
  Args:
  query: The search query
  collection_names: Specific collections to search within (if None, search all)
-
+
  Returns:
  List of preview dictionaries
  """
  # Determine which collections to search
  if collection_names:
  # Search only in specified collections
- collections_to_search = {name: self.collections[name] for name in collection_names
- if name in self.collections}
+ collections_to_search = {
+ name: self.collections[name]
+ for name in collection_names
+ if name in self.collections
+ }
  if not collections_to_search:
  logger.warning(f"No valid collections found among: {collection_names}")
  return []
  else:
  # Search in all collections
  collections_to_search = self.collections
-
+
  # Extract all folder paths from the collections to search
  search_paths = []
  for collection_config in collections_to_search.values():
  if "paths" in collection_config:
  search_paths.extend(collection_config["paths"])
-
- logger.info(f"Searching local documents in collections: {list(collections_to_search.keys())}")
-
+
+ logger.info(
+ f"Searching local documents in collections: {list(collections_to_search.keys())}"
+ )
+
  # Filter out invalid paths
- valid_search_paths = [path for path in search_paths if path in self.valid_folder_paths]
-
+ valid_search_paths = [
+ path for path in search_paths if path in self.valid_folder_paths
+ ]
+
  if not valid_search_paths:
- logger.warning(f"No valid folders to search in collections: {list(collections_to_search.keys())}")
+ logger.warning(
+ f"No valid folders to search in collections: {list(collections_to_search.keys())}"
+ )
  return []
-
+
  # Search across the valid selected folders
  raw_results = self.embedding_manager.search(
  query=query,
  folder_paths=valid_search_paths,
  limit=self.max_results,
- score_threshold=0.1 # Skip very low relevance results
+ score_threshold=0.1, # Skip very low relevance results
  )
-
+
  if not raw_results:
  logger.info(f"No local documents found for query: {query}")
  return []
-
+
  # Convert to preview format
  previews = []
  for i, result in enumerate(raw_results):
  # Create a unique ID
- result_id = f"local-{i}-{hashlib.md5(result['content'][:50].encode()).hexdigest()}"
-
+ result_id = (
+ f"local-{i}-{hashlib.md5(result['content'][:50].encode()).hexdigest()}"
+ )
+
  # Extract filename and path
- source_path = result['metadata'].get('source', 'Unknown')
- filename = result['metadata'].get('filename', os.path.basename(source_path))
-
+ source_path = result["metadata"].get("source", "Unknown")
+ filename = result["metadata"].get("filename", os.path.basename(source_path))
+
  # Create preview snippet (first ~200 chars of content)
- snippet = result['content'][:200] + "..." if len(result['content']) > 200 else result['content']
-
+ snippet = (
+ result["content"][:200] + "..."
+ if len(result["content"]) > 200
+ else result["content"]
+ )
+
  # Determine which collection this document belongs to
  collection_name = "Unknown"
- folder_path = result['folder']
+ folder_path = result["folder"]
  for name, collection in self.collections.items():
- if any(folder_path.startswith(path) for path in collection.get("paths", [])):
- collection_name = name
+ if any(
+ folder_path.is_relative_to(path)
+ for path in collection.get("paths", [])
+ ):
  break
-
+
  # Format the preview
  preview = {
  "id": result_id,
  "title": filename,
  "snippet": snippet,
  "link": source_path,
- "similarity": result['similarity'],
- "folder": folder_path,
+ "similarity": result["similarity"],
+ "folder": folder_path.as_posix(),
  "collection": collection_name,
- "collection_description": self.collections.get(collection_name, {}).get("description", ""),
- "_full_content": result['content'], # Store full content for later
- "_metadata": result['metadata'] # Store metadata for later
+ "collection_description": self.collections.get(collection_name, {}).get(
+ "description", ""
+ ),
+ "_full_content": result["content"], # Store full content for later
+ "_metadata": result["metadata"], # Store metadata for later
  }
-
+
  previews.append(preview)
-
+
  logger.info(f"Found {len(previews)} local document matches")
  return previews
-
- def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+
+ def _get_full_content(
+ self, relevant_items: List[Dict[str, Any]]
+ ) -> List[Dict[str, Any]]:
  """
  Get full content for the relevant documents.
  For local search, the full content is already available.
-
+
  Args:
  relevant_items: List of relevant preview dictionaries
-
+
  Returns:
  List of result dictionaries with full content
  """
  # Check if we should add full content
- if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+ if (
+ hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+ and search_config.SEARCH_SNIPPETS_ONLY
+ ):
  logger.info("Snippet-only mode, skipping full content addition")
  return relevant_items
-
+
  # For local search, we already have the full content
  results = []
  for item in relevant_items:
  # Create a copy with full content
  result = item.copy()
-
+
  # Add full content if we have it
  if "_full_content" in item:
  result["content"] = item["_full_content"]
  result["full_content"] = item["_full_content"]
-
+
  # Remove temporary fields
  if "_full_content" in result:
  del result["_full_content"]
-
+
  # Add metadata if we have it
  if "_metadata" in item:
  result["document_metadata"] = item["_metadata"]
-
+
  # Remove temporary fields
  if "_metadata" in result:
  del result["_metadata"]
-
+
  results.append(result)
-
+
  return results
-
- def run(self, query: str, collection_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+
+ def run(
+ self, query: str, collection_names: Optional[List[str]] = None
+ ) -> List[Dict[str, Any]]:
  """
  Execute a search using the two-phase approach.
-
+
  Args:
  query: The search query
  collection_names: Specific collections to search within (if None, search all)
-
+
  Returns:
  List of search result dictionaries with full content
  """
- logger.info(f"---Execute a search using Local Documents---")
-
+ logger.info("---Execute a search using Local Documents---")
+
  # Check if we have any special collection parameters in the query
  collection_prefix = "collection:"
  remaining_query = query
  specified_collections = []
-
+
  # Parse query for collection specifications like "collection:research_papers query terms"
  query_parts = query.split()
  for part in query_parts:
  if part.lower().startswith(collection_prefix):
- collection_name = part[len(collection_prefix):].strip()
+ collection_name = part[len(collection_prefix) :].strip()
  if collection_name in self.collections:
  specified_collections.append(collection_name)
  # Remove this part from the query
  remaining_query = remaining_query.replace(part, "", 1).strip()
-
+
  # If collections were specified in the query, they override the parameter
  if specified_collections:
  collection_names = specified_collections
  query = remaining_query
-
+
  # Phase 1: Get previews (with collection filtering)
  previews = self._get_previews(query, collection_names)
-
+
  if not previews:
  return []
-
+
  # Phase 2: Filter for relevance
  relevant_items = self._filter_for_relevance(previews, query)
-
+
  if not relevant_items:
  return []
-
+
  # Phase 3: Get full content for relevant items
- if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+ if (
+ hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+ and search_config.SEARCH_SNIPPETS_ONLY
+ ):
  logger.info("Returning snippet-only results as per config")
  results = relevant_items
  else:
  results = self._get_full_content(relevant_items)
-
+
  # Clean up temporary data
  self.embedding_manager.clear_cache()
-
+
  return results
-
+
  def get_collections_info(self) -> List[Dict[str, Any]]:
  """
  Get information about all collections, including indexing status.
-
+
  Returns:
  List of collection information dictionaries
  """
  collections_info = []
-
+
  for name, collection in self.collections.items():
  paths = collection.get("paths", [])
+ paths = [Path(p) for p in paths]
  description = collection.get("description", "")
-
+
  # Get indexing information for each path
  paths_info = []
  for path in paths:
  # Check if folder exists
- exists = os.path.exists(path) and os.path.isdir(path)
-
+ exists = path.exists() and path.is_dir()
+
  # Check if folder is indexed
- folder_hash = self.embedding_manager._get_folder_hash(path)
+ folder_hash = self.embedding_manager.get_folder_hash(path)
  indexed = folder_hash in self.embedding_manager.indexed_folders
-
+
  # Get index details if available
  index_info = {}
  if indexed:
- index_info = self.embedding_manager.indexed_folders[folder_hash].copy()
-
- paths_info.append({
- "path": path,
- "exists": exists,
- "indexed": indexed,
- "index_info": index_info
- })
-
- collections_info.append({
- "name": name,
- "description": description,
- "paths": paths,
- "paths_info": paths_info,
- "document_count": sum(info.get("index_info", {}).get("file_count", 0) for info in paths_info),
- "chunk_count": sum(info.get("index_info", {}).get("chunk_count", 0) for info in paths_info),
- "all_indexed": all(info["indexed"] for info in paths_info if info["exists"])
- })
-
+ index_info = self.embedding_manager.indexed_folders[
+ folder_hash
+ ].copy()
+
+ paths_info.append(
+ {
+ "path": path,
+ "exists": exists,
+ "indexed": indexed,
+ "index_info": index_info,
+ }
+ )
+
+ collections_info.append(
+ {
+ "name": name,
+ "description": description,
+ "paths": paths,
+ "paths_info": paths_info,
+ "document_count": sum(
+ info.get("index_info", {}).get("file_count", 0)
+ for info in paths_info
+ ),
+ "chunk_count": sum(
+ info.get("index_info", {}).get("chunk_count", 0)
+ for info in paths_info
+ ),
+ "all_indexed": all(
+ info["indexed"] for info in paths_info if info["exists"]
+ ),
+ }
+ )
+
  return collections_info
-
+
  def reindex_collection(self, collection_name: str) -> bool:
  """
  Reindex a specific collection.
-
+
  Args:
  collection_name: Name of the collection to reindex
-
+
  Returns:
  True if reindexing was successful, False otherwise
  """
  if collection_name not in self.collections:
  logger.error(f"Collection '{collection_name}' not found")
  return False
-
+
  paths = self.collections[collection_name].get("paths", [])
  success = True
-
+
  for path in paths:
  if not self.embedding_manager.index_folder(path, force_reindex=True):
  success = False
-
+
  return success
-
+
  @classmethod
- def from_config(cls, config_dict: Dict[str, Any], llm: Optional[BaseLLM] = None) -> "LocalSearchEngine":
+ def from_config(
+ cls, config_dict: Dict[str, Any], llm: Optional[BaseLLM] = None
+ ) -> "LocalSearchEngine":
  """
  Create a LocalSearchEngine instance from a configuration dictionary.
-
+
  Args:
  config_dict: Configuration dictionary
  llm: Language model for relevance filtering
-
+
  Returns:
  Initialized LocalSearchEngine instance
  """
  # Required parameters
  folder_paths = []
  collections = config_dict.get("collections", {})
-
+
  # Extract all folder paths from collections
  for collection_config in collections.values():
  if "paths" in collection_config:
  folder_paths.extend(collection_config["paths"])
-
+
  # Fall back to folder_paths if no collections defined
  if not folder_paths:
  folder_paths = config_dict.get("folder_paths", [])
  # Create a default collection if using folder_paths
  if folder_paths:
- collections = {"default": {"paths": folder_paths, "description": "Default collection"}}
-
+ collections = {
+ "default": {
+ "paths": folder_paths,
+ "description": "Default collection",
+ }
+ }
+
  # Optional parameters with defaults
  max_results = config_dict.get("max_results", 10)
  max_filtered_results = config_dict.get("max_filtered_results")
  embedding_model = config_dict.get("embedding_model", "all-MiniLM-L6-v2")
  embedding_device = config_dict.get("embedding_device", "cpu")
- embedding_model_type = config_dict.get("embedding_model_type", "sentence_transformers")
+ embedding_model_type = config_dict.get(
+ "embedding_model_type", "sentence_transformers"
+ )
  ollama_base_url = config_dict.get("ollama_base_url")
  force_reindex = config_dict.get("force_reindex", False)
  chunk_size = config_dict.get("chunk_size", 1000)
  chunk_overlap = config_dict.get("chunk_overlap", 200)
  cache_dir = config_dict.get("cache_dir", ".cache/local_search")
-
+
  return cls(
  paths=folder_paths,
  collections=collections,
@@ -937,5 +1100,5 @@ class LocalSearchEngine(BaseSearchEngine):
  force_reindex=force_reindex,
  chunk_size=chunk_size,
  chunk_overlap=chunk_overlap,
- cache_dir=cache_dir
+ cache_dir=cache_dir,
  )
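The hunks above appear to come from local_deep_research/web_search_engines/engines/search_engine_local.py (item 112 in the listing) and end with the updated `from_config` constructor. A minimal usage sketch against the 0.2.2 signatures shown above; the folder path and collection name are hypothetical examples, and an LLM for relevance filtering can be passed via the optional `llm` argument:

from local_deep_research.web_search_engines.engines.search_engine_local import LocalSearchEngine

# Hypothetical collection layout; keys follow the from_config() code above.
engine = LocalSearchEngine.from_config(
    {
        "collections": {
            "notes": {"paths": ["/home/user/notes"], "description": "Personal notes"}
        },
        "embedding_model": "all-MiniLM-L6-v2",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "cache_dir": ".cache/local_search",
    }
)

# The "collection:" prefix scopes the search to one collection, per run() above.
results = engine.run("collection:notes vector databases")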