local_deep_research-0.1.0-py3-none-any.whl

Files changed (56)
  1. local_deep_research/__init__.py +24 -0
  2. local_deep_research/citation_handler.py +113 -0
  3. local_deep_research/config.py +166 -0
  4. local_deep_research/defaults/__init__.py +44 -0
  5. local_deep_research/defaults/llm_config.py +269 -0
  6. local_deep_research/defaults/local_collections.toml +47 -0
  7. local_deep_research/defaults/main.toml +57 -0
  8. local_deep_research/defaults/search_engines.toml +244 -0
  9. local_deep_research/local_collections.py +141 -0
  10. local_deep_research/main.py +113 -0
  11. local_deep_research/report_generator.py +206 -0
  12. local_deep_research/search_system.py +241 -0
  13. local_deep_research/utilties/__init__.py +0 -0
  14. local_deep_research/utilties/enums.py +9 -0
  15. local_deep_research/utilties/llm_utils.py +116 -0
  16. local_deep_research/utilties/search_utilities.py +115 -0
  17. local_deep_research/utilties/setup_utils.py +6 -0
  18. local_deep_research/web/__init__.py +2 -0
  19. local_deep_research/web/app.py +1209 -0
  20. local_deep_research/web/static/css/styles.css +1008 -0
  21. local_deep_research/web/static/js/app.js +2078 -0
  22. local_deep_research/web/templates/api_keys_config.html +82 -0
  23. local_deep_research/web/templates/collections_config.html +90 -0
  24. local_deep_research/web/templates/index.html +312 -0
  25. local_deep_research/web/templates/llm_config.html +120 -0
  26. local_deep_research/web/templates/main_config.html +89 -0
  27. local_deep_research/web/templates/search_engines_config.html +154 -0
  28. local_deep_research/web/templates/settings.html +519 -0
  29. local_deep_research/web/templates/settings_dashboard.html +207 -0
  30. local_deep_research/web_search_engines/__init__.py +0 -0
  31. local_deep_research/web_search_engines/engines/__init__.py +0 -0
  32. local_deep_research/web_search_engines/engines/full_search.py +128 -0
  33. local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
  34. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
  35. local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
  36. local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
  37. local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
  38. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
  39. local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
  40. local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
  41. local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
  42. local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
  43. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
  44. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
  45. local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
  46. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
  47. local_deep_research/web_search_engines/full_search.py +254 -0
  48. local_deep_research/web_search_engines/search_engine_base.py +197 -0
  49. local_deep_research/web_search_engines/search_engine_factory.py +233 -0
  50. local_deep_research/web_search_engines/search_engines_config.py +54 -0
  51. local_deep_research-0.1.0.dist-info/LICENSE +21 -0
  52. local_deep_research-0.1.0.dist-info/METADATA +328 -0
  53. local_deep_research-0.1.0.dist-info/RECORD +56 -0
  54. local_deep_research-0.1.0.dist-info/WHEEL +5 -0
  55. local_deep_research-0.1.0.dist-info/entry_points.txt +3 -0
  56. local_deep_research-0.1.0.dist-info/top_level.txt +1 -0
local_deep_research/web_search_engines/engines/search_engine_local.py
@@ -0,0 +1,901 @@
+ from typing import Dict, List, Any, Optional, Tuple, Union
+ import os
+ import json
+ import hashlib
+ import time
+ from datetime import datetime
+ from pathlib import Path
+ import tiktoken
+ import logging
+ import re
+ import pickle
+
+ from langchain_core.language_models import BaseLLM
+ from langchain_community.document_loaders import (
+     PyPDFLoader,
+     TextLoader,
+     UnstructuredMarkdownLoader,
+     UnstructuredWordDocumentLoader,
+     CSVLoader,
+     UnstructuredExcelLoader,
+     DirectoryLoader
+ )
+ from langchain_community.document_loaders.base import BaseLoader
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import (
+     HuggingFaceEmbeddings,
+     OllamaEmbeddings,
+     SentenceTransformerEmbeddings
+ )
+
+ from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
+ from local_deep_research import config
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class LocalEmbeddingManager:
+     """Handles embedding generation and storage for local document search"""
+
+     def __init__(
+         self,
+         embedding_model: str = "all-MiniLM-L6-v2",
+         embedding_device: str = "cpu",
+         embedding_model_type: str = "sentence_transformers",  # or 'ollama'
+         ollama_base_url: Optional[str] = None,
+         chunk_size: int = 1000,
+         chunk_overlap: int = 200,
+         cache_dir: str = ".cache/local_search",
+     ):
+         """
+         Initialize the embedding manager for local document search.
+
+         Args:
+             embedding_model: Name of the embedding model to use
+             embedding_device: Device to run embeddings on ('cpu' or 'cuda')
+             embedding_model_type: Type of embedding model ('sentence_transformers' or 'ollama')
+             ollama_base_url: Base URL for Ollama API if using ollama embeddings
+             chunk_size: Size of text chunks for splitting documents
+             chunk_overlap: Overlap between chunks
+             cache_dir: Directory to store embedding cache and index
+         """
+
+         self.embedding_model = embedding_model
+         self.embedding_device = embedding_device
+         self.embedding_model_type = embedding_model_type
+         self.ollama_base_url = ollama_base_url
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+         self.cache_dir = Path(cache_dir)
+
+         # Create cache directory if it doesn't exist
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+         # Initialize the embedding model
+         self._embeddings = None
+
+         # Initialize the text splitter
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap
+         )
+
+         # Track indexed folders and their metadata
+         self.indexed_folders = self._load_indexed_folders()
+
+         # Vector store cache
+         self.vector_stores = {}
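As a rough illustration (not part of the package), the constructor above might be driven directly like this; the values simply restate the documented defaults:

manager = LocalEmbeddingManager(
    embedding_model="all-MiniLM-L6-v2",
    embedding_model_type="sentence_transformers",  # or "ollama" with ollama_base_url set
    chunk_size=1000,
    chunk_overlap=200,
    cache_dir=".cache/local_search",
)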
+     @property
+     def embeddings(self):
+         """
+         Lazily initialize embeddings when first accessed.
+         This allows the LocalEmbeddingManager to be created without
+         immediately loading models, which is helpful when no local search is performed.
+         """
+         if self._embeddings is None:
+             logger.info("Initializing embeddings on first use")
+             self._embeddings = self._initialize_embeddings()
+         return self._embeddings
+
+     def _initialize_embeddings(self):
+         """Initialize the embedding model based on configuration"""
+         try:
+             if self.embedding_model_type == "ollama":
+                 # Use Ollama for embeddings
+                 if not self.ollama_base_url:
+                     self.ollama_base_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
+
+                 logger.info(f"Initializing Ollama embeddings with model {self.embedding_model}")
+                 return OllamaEmbeddings(
+                     model=self.embedding_model,
+                     base_url=self.ollama_base_url
+                 )
+             else:
+                 # Default: Use SentenceTransformers/HuggingFace
+                 logger.info(f"Initializing SentenceTransformerEmbeddings with model {self.embedding_model}")
+                 return SentenceTransformerEmbeddings(
+                     model_name=self.embedding_model,
+                     model_kwargs={"device": self.embedding_device}
+                 )
+         except Exception as e:
+             logger.error(f"Error initializing embeddings: {e}")
+             logger.warning("Falling back to HuggingFaceEmbeddings with all-MiniLM-L6-v2")
+             return HuggingFaceEmbeddings(
+                 model_name="sentence-transformers/all-MiniLM-L6-v2"
+             )
+     def _load_or_create_vector_store(self):
+         """Load the vector store from disk or create it if needed"""
+         vector_store_path = self._get_vector_store_path()
+
+         # Check if vector store exists and is up to date
+         if vector_store_path.exists() and not self._check_folders_modified():
+             logger.info(f"Loading existing vector store from {vector_store_path}")
+             try:
+                 vector_store = FAISS.load_local(
+                     str(vector_store_path),
+                     self.embeddings,
+                     allow_dangerous_deserialization=True
+                 )
+
+                 # Add this code to show document count
+                 doc_count = len(vector_store.index_to_docstore_id)
+                 logger.info(f"Loaded index with {doc_count} document chunks")
+
+                 return vector_store
+             except Exception as e:
+                 logger.error(f"Error loading vector store: {e}")
+                 logger.info("Will create a new vector store")
+
+         # Create a new vector store
+         return self._create_vector_store()
+     def _load_indexed_folders(self) -> Dict[str, Dict[str, Any]]:
+         """Load metadata about indexed folders from disk"""
+         index_metadata_path = self.cache_dir / "index_metadata.json"
+
+         if index_metadata_path.exists():
+             try:
+                 with open(index_metadata_path, "r") as f:
+                     return json.load(f)
+             except Exception as e:
+                 logger.error(f"Error loading index metadata: {e}")
+
+         return {}
+
+     def _save_indexed_folders(self):
+         """Save metadata about indexed folders to disk"""
+         index_metadata_path = self.cache_dir / "index_metadata.json"
+
+         try:
+             with open(index_metadata_path, "w") as f:
+                 json.dump(self.indexed_folders, f, indent=2)
+         except Exception as e:
+             logger.error(f"Error saving index metadata: {e}")
+
+     def _get_folder_hash(self, folder_path: str) -> str:
+         """Generate a hash for a folder based on its path"""
+         return hashlib.md5(folder_path.encode()).hexdigest()
+
+     def _get_index_path(self, folder_path: str) -> Path:
+         """Get the path where the index for a specific folder should be stored"""
+         folder_hash = self._get_folder_hash(folder_path)
+         return self.cache_dir / f"index_{folder_hash}"
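Taken together, _get_folder_hash and _get_index_path give each indexed folder its own FAISS index directory under the cache. A sketch of the resulting on-disk layout (folder name hypothetical):

.cache/local_search/
    index_metadata.json           # per-folder stats written by _save_indexed_folders()
    index_<md5-of-folder-path>/   # FAISS index saved by index_folder() via save_local()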
+
+     def _check_folder_modified(self, folder_path: str) -> bool:
+         """Check if a folder has been modified since it was last indexed"""
+         folder_path = Path(folder_path)
+
+         if not folder_path.exists() or not folder_path.is_dir():
+             return False
+
+         folder_hash = self._get_folder_hash(str(folder_path))
+
+         # If folder has never been indexed, it's considered modified
+         if folder_hash not in self.indexed_folders:
+             return True
+
+         last_indexed = self.indexed_folders[folder_hash].get("last_indexed", 0)
+
+         # Check if any file in the folder has been modified since last indexing
+         for root, _, files in os.walk(folder_path):
+             for file in files:
+                 file_path = Path(root) / file
+                 if file_path.stat().st_mtime > last_indexed:
+                     return True
+
+         return False
+
+     def get_file_loader(self, file_path: str) -> Optional[BaseLoader]:
+         """Get an appropriate document loader for a file based on its extension"""
+         file_path = Path(file_path)
+         extension = file_path.suffix.lower()
+
+         try:
+             if extension == ".pdf":
+                 return PyPDFLoader(str(file_path))
+             elif extension == ".txt":
+                 return TextLoader(str(file_path))
+             elif extension in [".md", ".markdown"]:
+                 return UnstructuredMarkdownLoader(str(file_path))
+             elif extension in [".doc", ".docx"]:
+                 return UnstructuredWordDocumentLoader(str(file_path))
+             elif extension == ".csv":
+                 return CSVLoader(str(file_path))
+             elif extension in [".xls", ".xlsx"]:
+                 return UnstructuredExcelLoader(str(file_path))
+             else:
+                 # Try the text loader as a fallback for unknown extensions
+                 logger.warning(f"Unknown file extension for {file_path}, trying TextLoader")
+                 return TextLoader(str(file_path), encoding="utf-8")
+         except Exception as e:
+             logger.error(f"Error creating loader for {file_path}: {e}")
+             return None
+
+     def index_folder(self, folder_path: str, force_reindex: bool = False) -> bool:
+         """
+         Index all documents in a folder for vector search.
+
+         Args:
+             folder_path: Path to the folder to index
+             force_reindex: Whether to force reindexing even if unchanged
+
+         Returns:
+             bool: True if indexing was successful, False otherwise
+         """
+         folder_path = Path(folder_path)
+
+         # Validate folder
+         if not folder_path.exists():
+             logger.error(f"Folder not found: {folder_path}")
+             return False
+
+         if not folder_path.is_dir():
+             logger.error(f"Path is not a directory: {folder_path}")
+             return False
+
+         folder_str = str(folder_path)
+         folder_hash = self._get_folder_hash(folder_str)
+         index_path = self._get_index_path(folder_str)
+
+         # Check if folder needs to be reindexed
+         if not force_reindex and not self._check_folder_modified(folder_str):
+             logger.info(f"Folder {folder_path} has not been modified since last indexing")
+
+             # Load the vector store from disk if not already loaded
+             if folder_hash not in self.vector_stores:
+                 try:
+                     self.vector_stores[folder_hash] = FAISS.load_local(
+                         str(index_path),
+                         self.embeddings,
+                         allow_dangerous_deserialization=True
+                     )
+                     logger.info(f"Loaded index for {folder_path} from disk")
+                 except Exception as e:
+                     logger.error(f"Error loading index for {folder_path}: {e}")
+                     # If loading fails, force reindexing
+                     force_reindex = True
+             else:
+                 logger.info(f"Using cached index for {folder_path}")
+
+             # If no reindexing is needed and vector store loaded successfully
+             if not force_reindex and folder_hash in self.vector_stores:
+                 return True
+
+         logger.info(f"Indexing folder: {folder_path}")
+         start_time = time.time()
+
+         # Find documents to index
+         all_docs = []
+         file_count = 0
+         error_count = 0
+
+         for root, _, files in os.walk(folder_path):
+             for file in files:
+                 file_path = Path(root) / file
+
+                 # Skip hidden files and directories
+                 if file.startswith(".") or any(part.startswith(".") for part in file_path.parts):
+                     continue
+
+                 # Get a loader for this file
+                 loader = self.get_file_loader(str(file_path))
+
+                 if loader:
+                     try:
+                         # Load the document
+                         docs = loader.load()
+
+                         # Add source path metadata
+                         for doc in docs:
+                             doc.metadata["source"] = str(file_path)
+                             doc.metadata["filename"] = file
+
+                         all_docs.extend(docs)
+                         file_count += 1
+                     except Exception as e:
+                         logger.error(f"Error loading {file_path}: {e}")
+                         error_count += 1
+
+         if not all_docs:
+             logger.warning(f"No documents found in {folder_path} or all documents failed to load")
+             return False
+
+         # Split documents into chunks
+         logger.info(f"Splitting {len(all_docs)} documents into chunks")
+         splits = self.text_splitter.split_documents(all_docs)
+         logger.info(f"Created {len(splits)} chunks from {file_count} files")
+
+         # Create vector store
+         logger.info(f"Creating vector store with {len(splits)} chunks")
+         vector_store = FAISS.from_documents(splits, self.embeddings)
+
+         # Save the vector store to disk
+         logger.info(f"Saving index to {index_path}")
+         vector_store.save_local(str(index_path))
+
+         # Update cache
+         self.vector_stores[folder_hash] = vector_store
+
+         # Update metadata
+         self.indexed_folders[folder_hash] = {
+             "path": folder_str,
+             "last_indexed": time.time(),
+             "file_count": file_count,
+             "chunk_count": len(splits),
+             "error_count": error_count,
+             "embedding_model": self.embedding_model,
+             "chunk_size": self.chunk_size,
+             "chunk_overlap": self.chunk_overlap
+         }
+
+         # Save updated metadata
+         self._save_indexed_folders()
+
+         elapsed_time = time.time() - start_time
+         logger.info(f"Indexed {file_count} files in {elapsed_time:.2f} seconds")
+
+         return True
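For reference, a sketch of the kind of entry index_folder() records in index_metadata.json, keyed by the folder's MD5 hash; the values shown are illustrative only:

{
  "<md5-of-folder-path>": {
    "path": "/path/to/notes",
    "last_indexed": 1710000000.0,
    "file_count": 12,
    "chunk_count": 240,
    "error_count": 0,
    "embedding_model": "all-MiniLM-L6-v2",
    "chunk_size": 1000,
    "chunk_overlap": 200
  }
}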
+
+     def search(
+         self,
+         query: str,
+         folder_paths: List[str],
+         limit: int = 10,
+         score_threshold: float = 0.0,
+     ) -> List[Dict[str, Any]]:
+         """
+         Search for documents relevant to a query across specified folders.
+
+         Args:
+             query: The search query
+             folder_paths: List of folder paths to search in
+             limit: Maximum number of results to return
+             score_threshold: Minimum similarity score threshold
+
+         Returns:
+             List of results with document content and metadata
+         """
+         # Add detailed debugging for each folder
+         for folder_path in folder_paths:
+             folder_hash = self._get_folder_hash(folder_path)
+             index_path = self._get_index_path(folder_path)
+
+             logger.info(f"Diagnostic for {folder_path}:")
+             logger.info(f" - Folder hash: {folder_hash}")
+             logger.info(f" - Index path: {index_path}")
+             logger.info(f" - Index exists on disk: {index_path.exists()}")
+             logger.info(f" - Is in indexed_folders: {folder_hash in self.indexed_folders}")
+
+             if folder_hash in self.indexed_folders:
+                 meta = self.indexed_folders[folder_hash]
+                 logger.info(f" - Metadata: file_count={meta.get('file_count', 0)}, chunk_count={meta.get('chunk_count', 0)}")
+
+         # Validate folders exist
+         valid_folder_paths = []
+         for path in folder_paths:
+             if os.path.exists(path) and os.path.isdir(path):
+                 valid_folder_paths.append(path)
+             else:
+                 logger.warning(f"Skipping non-existent folder in search: {path}")
+
+         # If no valid folders, return empty results
+         if not valid_folder_paths:
+             logger.warning(f"No valid folders to search among: {folder_paths}")
+             return []
+
+         all_results = []
+
+         for folder_path in valid_folder_paths:
+             folder_hash = self._get_folder_hash(folder_path)
+
+             # Skip folders that haven't been indexed
+             if folder_hash not in self.indexed_folders:
+                 logger.warning(f"Folder {folder_path} has not been indexed")
+                 continue
+
+             # Make sure the vector store is loaded
+             if folder_hash not in self.vector_stores:
+                 index_path = self._get_index_path(folder_path)
+                 try:
+                     self.vector_stores[folder_hash] = FAISS.load_local(
+                         str(index_path),
+                         self.embeddings,
+                         allow_dangerous_deserialization=True
+                     )
+                 except Exception as e:
+                     logger.error(f"Error loading index for {folder_path}: {e}")
+                     continue
+
+             # Search in this folder
+             vector_store = self.vector_stores[folder_hash]
+
+             try:
+                 docs_with_scores = vector_store.similarity_search_with_score(query, k=limit)
+
+                 for doc, score in docs_with_scores:
+                     # Convert score from distance to similarity (lower distance = higher similarity)
+                     # FAISS cosine distance is in [0, 2], where 0 is identical and 2 is opposite
+                     # Convert to a similarity score in [0, 1]
+                     similarity = 1.0 - (score / 2.0)
+
+                     # Skip results below the threshold
+                     if similarity < score_threshold:
+                         continue
+
+                     result = {
+                         "content": doc.page_content,
+                         "metadata": doc.metadata,
+                         "similarity": float(similarity),
+                         "folder": folder_path
+                     }
+
+                     all_results.append(result)
+             except Exception as e:
+                 logger.error(f"Error searching in {folder_path}: {e}")
+
+         # Sort by similarity (highest first)
+         all_results.sort(key=lambda x: x["similarity"], reverse=True)
+
+         # Limit to the requested number
+         return all_results[:limit]
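A brief sketch of calling search() directly and the shape of each returned entry; query and folder path are hypothetical:

hits = manager.search(
    "retrieval augmented generation",
    folder_paths=["/path/to/notes"],
    limit=5,
    score_threshold=0.1,
)
# each hit: {"content": ..., "metadata": {...}, "similarity": <0.0-1.0>, "folder": "/path/to/notes"}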
+
+     def clear_cache(self):
+         """Clear all cached vector stores from memory (not disk)"""
+         self.vector_stores.clear()
+
+     def get_indexed_folders_info(self) -> List[Dict[str, Any]]:
+         """Get information about all indexed folders"""
+         info = []
+
+         for folder_hash, metadata in self.indexed_folders.items():
+             folder_info = metadata.copy()
+
+             # Add formatted last indexed time
+             if "last_indexed" in folder_info:
+                 folder_info["last_indexed_formatted"] = datetime.fromtimestamp(
+                     folder_info["last_indexed"]
+                 ).strftime("%Y-%m-%d %H:%M:%S")
+
+             # Check if index file exists
+             index_path = self._get_index_path(folder_info["path"])
+             folder_info["index_exists"] = index_path.exists()
+
+             info.append(folder_info)
+
+         return info
+
+
+ class LocalSearchEngine(BaseSearchEngine):
+     """Local document search engine with two-phase retrieval"""
+
+     def __init__(
+         self,
+         folder_paths: List[str],
+         llm: Optional[BaseLLM] = None,
+         max_results: int = 10,
+         max_filtered_results: Optional[int] = None,
+         embedding_model: str = "all-MiniLM-L6-v2",
+         embedding_device: str = "cpu",
+         embedding_model_type: str = "sentence_transformers",
+         ollama_base_url: Optional[str] = None,
+         force_reindex: bool = False,
+         chunk_size: int = 1000,
+         chunk_overlap: int = 200,
+         cache_dir: str = ".cache/local_search",
+         collections: Optional[Dict[str, Dict[str, Any]]] = None,
+     ):
+         """
+         Initialize the local search engine.
+
+         Args:
+             folder_paths: List of folder paths to search in
+             llm: Language model for relevance filtering
+             max_results: Maximum number of results to return
+             max_filtered_results: Maximum results after filtering
+             embedding_model: Name of the embedding model to use
+             embedding_device: Device to run embeddings on ('cpu' or 'cuda')
+             embedding_model_type: Type of embedding model
+             ollama_base_url: Base URL for Ollama API
+             force_reindex: Whether to force reindexing
+             chunk_size: Size of text chunks for splitting documents
+             chunk_overlap: Overlap between chunks
+             cache_dir: Directory to store embedding cache and index
+             collections: Dictionary of named collections with paths and descriptions
+         """
+         # Initialize the base search engine
+         super().__init__(llm=llm, max_filtered_results=max_filtered_results)
+
+         # Validate folder paths
+         self.folder_paths = folder_paths
+         self.valid_folder_paths = []
+         for path in folder_paths:
+             if os.path.exists(path) and os.path.isdir(path):
+                 self.valid_folder_paths.append(path)
+             else:
+                 logger.warning(f"Folder not found or is not a directory: {path}")
+
+         # If no valid folders, log a clear message
+         if not self.valid_folder_paths and folder_paths:
+             logger.warning(f"No valid folders found among: {folder_paths}")
+             logger.warning("This search engine will return no results until valid folders are configured")
+
+         self.max_results = max_results
+         self.collections = collections or {"default": {"paths": folder_paths, "description": "Default collection"}}
+
+         # Initialize the embedding manager with only valid folders
+         self.embedding_manager = LocalEmbeddingManager(
+             embedding_model=embedding_model,
+             embedding_device=embedding_device,
+             embedding_model_type=embedding_model_type,
+             ollama_base_url=ollama_base_url,
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             cache_dir=cache_dir
+         )
+
+         # Index all folders
+         self._index_folders(force_reindex)
+
+     def _index_folders(self, force_reindex: bool = False):
+         """Index all valid configured folders"""
+         indexed = []
+         failed = []
+         skipped = []
+
+         # Keep track of invalid folders
+         for folder in self.folder_paths:
+             if folder not in self.valid_folder_paths:
+                 skipped.append(folder)
+                 continue
+
+             success = self.embedding_manager.index_folder(folder, force_reindex)
+             if success:
+                 indexed.append(folder)
+             else:
+                 failed.append(folder)
+
+         if indexed:
+             logger.info(f"Successfully indexed {len(indexed)} folders: {', '.join(indexed)}")
+
+         if failed:
+             logger.warning(f"Failed to index {len(failed)} folders: {', '.join(failed)}")
+
+         if skipped:
+             logger.warning(f"Skipped {len(skipped)} invalid folders: {', '.join(skipped)}")
+
+     def _get_previews(self, query: str, collection_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+         """
+         Get preview information for documents matching the query.
+
+         Args:
+             query: The search query
+             collection_names: Specific collections to search within (if None, search all)
+
+         Returns:
+             List of preview dictionaries
+         """
+         # Determine which collections to search
+         if collection_names:
+             # Search only in specified collections
+             collections_to_search = {name: self.collections[name] for name in collection_names
+                                      if name in self.collections}
+             if not collections_to_search:
+                 logger.warning(f"No valid collections found among: {collection_names}")
+                 return []
+         else:
+             # Search in all collections
+             collections_to_search = self.collections
+
+         # Extract all folder paths from the collections to search
+         search_paths = []
+         for collection_config in collections_to_search.values():
+             if "paths" in collection_config:
+                 search_paths.extend(collection_config["paths"])
+
+         logger.info(f"Searching local documents in collections: {list(collections_to_search.keys())}")
+
+         # Filter out invalid paths
+         valid_search_paths = [path for path in search_paths if path in self.valid_folder_paths]
+
+         if not valid_search_paths:
+             logger.warning(f"No valid folders to search in collections: {list(collections_to_search.keys())}")
+             return []
+
+         # Search across the valid selected folders
+         raw_results = self.embedding_manager.search(
+             query=query,
+             folder_paths=valid_search_paths,
+             limit=self.max_results,
+             score_threshold=0.1 # Skip very low relevance results
+         )
+
+         if not raw_results:
+             logger.info(f"No local documents found for query: {query}")
+             return []
+
+         # Convert to preview format
+         previews = []
+         for i, result in enumerate(raw_results):
+             # Create a unique ID
+             result_id = f"local-{i}-{hashlib.md5(result['content'][:50].encode()).hexdigest()}"
+
+             # Extract filename and path
+             source_path = result['metadata'].get('source', 'Unknown')
+             filename = result['metadata'].get('filename', os.path.basename(source_path))
+
+             # Create preview snippet (first ~200 chars of content)
+             snippet = result['content'][:200] + "..." if len(result['content']) > 200 else result['content']
+
+             # Determine which collection this document belongs to
+             collection_name = "Unknown"
+             folder_path = result['folder']
+             for name, collection in self.collections.items():
+                 if any(folder_path.startswith(path) for path in collection.get("paths", [])):
+                     collection_name = name
+                     break
+
+             # Format the preview
+             preview = {
+                 "id": result_id,
+                 "title": filename,
+                 "snippet": snippet,
+                 "link": source_path,
+                 "similarity": result['similarity'],
+                 "folder": folder_path,
+                 "collection": collection_name,
+                 "collection_description": self.collections.get(collection_name, {}).get("description", ""),
+                 "_full_content": result['content'], # Store full content for later
+                 "_metadata": result['metadata'] # Store metadata for later
+             }
+
+             previews.append(preview)
+
+         logger.info(f"Found {len(previews)} local document matches")
+         return previews
+
+     def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """
+         Get full content for the relevant documents.
+         For local search, the full content is already available.
+
+         Args:
+             relevant_items: List of relevant preview dictionaries
+
+         Returns:
+             List of result dictionaries with full content
+         """
+         # Check if we should add full content
+         if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+             logger.info("Snippet-only mode, skipping full content addition")
+             return relevant_items
+
+         # For local search, we already have the full content
+         results = []
+         for item in relevant_items:
+             # Create a copy with full content
+             result = item.copy()
+
+             # Add full content if we have it
+             if "_full_content" in item:
+                 result["content"] = item["_full_content"]
+                 result["full_content"] = item["_full_content"]
+
+                 # Remove temporary fields
+                 if "_full_content" in result:
+                     del result["_full_content"]
+
+             # Add metadata if we have it
+             if "_metadata" in item:
+                 result["document_metadata"] = item["_metadata"]
+
+                 # Remove temporary fields
+                 if "_metadata" in result:
+                     del result["_metadata"]
+
+             results.append(result)
+
+         return results
+
+     def run(self, query: str, collection_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+         """
+         Execute a search using the two-phase approach.
+
+         Args:
+             query: The search query
+             collection_names: Specific collections to search within (if None, search all)
+
+         Returns:
+             List of search result dictionaries with full content
+         """
+         logger.info(f"---Execute a search using Local Documents---")
+
+         # Check if we have any special collection parameters in the query
+         collection_prefix = "collection:"
+         remaining_query = query
+         specified_collections = []
+
+         # Parse query for collection specifications like "collection:research_papers query terms"
+         query_parts = query.split()
+         for part in query_parts:
+             if part.lower().startswith(collection_prefix):
+                 collection_name = part[len(collection_prefix):].strip()
+                 if collection_name in self.collections:
+                     specified_collections.append(collection_name)
+                     # Remove this part from the query
+                     remaining_query = remaining_query.replace(part, "", 1).strip()
+
+         # If collections were specified in the query, they override the parameter
+         if specified_collections:
+             collection_names = specified_collections
+             query = remaining_query
+
+         # Phase 1: Get previews (with collection filtering)
+         previews = self._get_previews(query, collection_names)
+
+         if not previews:
+             return []
+
+         # Phase 2: Filter for relevance
+         relevant_items = self._filter_for_relevance(previews, query)
+
+         if not relevant_items:
+             return []
+
+         # Phase 3: Get full content for relevant items
+         if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+             print("Returning snippet-only results as per config")
+             results = relevant_items
+         else:
+             results = self._get_full_content(relevant_items)
+
+         # Clean up temporary data
+         self.embedding_manager.clear_cache()
+
+         return results
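A sketch of the collection-prefix behaviour handled above, assuming a collection named research_papers has been configured; paths and query are illustrative:

engine = LocalSearchEngine(
    folder_paths=["/path/to/papers"],
    collections={"research_papers": {"paths": ["/path/to/papers"], "description": "Local papers"}},
)
# "collection:research_papers" is stripped from the query text and used to restrict the search
results = engine.run("collection:research_papers chain-of-thought prompting")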
+
+     def get_collections_info(self) -> List[Dict[str, Any]]:
+         """
+         Get information about all collections, including indexing status.
+
+         Returns:
+             List of collection information dictionaries
+         """
+         collections_info = []
+
+         for name, collection in self.collections.items():
+             paths = collection.get("paths", [])
+             description = collection.get("description", "")
+
+             # Get indexing information for each path
+             paths_info = []
+             for path in paths:
+                 # Check if folder exists
+                 exists = os.path.exists(path) and os.path.isdir(path)
+
+                 # Check if folder is indexed
+                 folder_hash = self.embedding_manager._get_folder_hash(path)
+                 indexed = folder_hash in self.embedding_manager.indexed_folders
+
+                 # Get index details if available
+                 index_info = {}
+                 if indexed:
+                     index_info = self.embedding_manager.indexed_folders[folder_hash].copy()
+
+                 paths_info.append({
+                     "path": path,
+                     "exists": exists,
+                     "indexed": indexed,
+                     "index_info": index_info
+                 })
+
+             collections_info.append({
+                 "name": name,
+                 "description": description,
+                 "paths": paths,
+                 "paths_info": paths_info,
+                 "document_count": sum(info.get("index_info", {}).get("file_count", 0) for info in paths_info),
+                 "chunk_count": sum(info.get("index_info", {}).get("chunk_count", 0) for info in paths_info),
+                 "all_indexed": all(info["indexed"] for info in paths_info if info["exists"])
+             })
+
+         return collections_info
+
+     def reindex_collection(self, collection_name: str) -> bool:
+         """
+         Reindex a specific collection.
+
+         Args:
+             collection_name: Name of the collection to reindex
+
+         Returns:
+             True if reindexing was successful, False otherwise
+         """
+         if collection_name not in self.collections:
+             logger.error(f"Collection '{collection_name}' not found")
+             return False
+
+         paths = self.collections[collection_name].get("paths", [])
+         success = True
+
+         for path in paths:
+             if not self.embedding_manager.index_folder(path, force_reindex=True):
+                 success = False
+
+         return success
+
+     @classmethod
+     def from_config(cls, config_dict: Dict[str, Any], llm: Optional[BaseLLM] = None) -> "LocalSearchEngine":
+         """
+         Create a LocalSearchEngine instance from a configuration dictionary.
+
+         Args:
+             config_dict: Configuration dictionary
+             llm: Language model for relevance filtering
+
+         Returns:
+             Initialized LocalSearchEngine instance
+         """
+         # Required parameters
+         folder_paths = []
+         collections = config_dict.get("collections", {})
+
+         # Extract all folder paths from collections
+         for collection_config in collections.values():
+             if "paths" in collection_config:
+                 folder_paths.extend(collection_config["paths"])
+
+         # Fall back to folder_paths if no collections defined
+         if not folder_paths:
+             folder_paths = config_dict.get("folder_paths", [])
+             # Create a default collection if using folder_paths
+             if folder_paths:
+                 collections = {"default": {"paths": folder_paths, "description": "Default collection"}}
+
+         # Optional parameters with defaults
+         max_results = config_dict.get("max_results", 10)
+         max_filtered_results = config_dict.get("max_filtered_results")
+         embedding_model = config_dict.get("embedding_model", "all-MiniLM-L6-v2")
+         embedding_device = config_dict.get("embedding_device", "cpu")
+         embedding_model_type = config_dict.get("embedding_model_type", "sentence_transformers")
+         ollama_base_url = config_dict.get("ollama_base_url")
+         force_reindex = config_dict.get("force_reindex", False)
+         chunk_size = config_dict.get("chunk_size", 1000)
+         chunk_overlap = config_dict.get("chunk_overlap", 200)
+         cache_dir = config_dict.get("cache_dir", ".cache/local_search")
+
+         return cls(
+             folder_paths=folder_paths,
+             collections=collections,
+             llm=llm,
+             max_results=max_results,
+             max_filtered_results=max_filtered_results,
+             embedding_model=embedding_model,
+             embedding_device=embedding_device,
+             embedding_model_type=embedding_model_type,
+             ollama_base_url=ollama_base_url,
+             force_reindex=force_reindex,
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             cache_dir=cache_dir
+         )
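Finally, a sketch of a configuration dictionary that from_config() would accept, limited to keys the method actually reads; paths are illustrative:

config_dict = {
    "collections": {
        "notes": {"paths": ["/path/to/notes"], "description": "Personal notes"},
    },
    "max_results": 10,
    "embedding_model": "all-MiniLM-L6-v2",
    "embedding_model_type": "sentence_transformers",
    "chunk_size": 1000,
    "chunk_overlap": 200,
}
engine = LocalSearchEngine.from_config(config_dict)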