local_deep_research-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +24 -0
- local_deep_research/citation_handler.py +113 -0
- local_deep_research/config.py +166 -0
- local_deep_research/defaults/__init__.py +44 -0
- local_deep_research/defaults/llm_config.py +269 -0
- local_deep_research/defaults/local_collections.toml +47 -0
- local_deep_research/defaults/main.toml +57 -0
- local_deep_research/defaults/search_engines.toml +244 -0
- local_deep_research/local_collections.py +141 -0
- local_deep_research/main.py +113 -0
- local_deep_research/report_generator.py +206 -0
- local_deep_research/search_system.py +241 -0
- local_deep_research/utilties/__init__.py +0 -0
- local_deep_research/utilties/enums.py +9 -0
- local_deep_research/utilties/llm_utils.py +116 -0
- local_deep_research/utilties/search_utilities.py +115 -0
- local_deep_research/utilties/setup_utils.py +6 -0
- local_deep_research/web/__init__.py +2 -0
- local_deep_research/web/app.py +1209 -0
- local_deep_research/web/static/css/styles.css +1008 -0
- local_deep_research/web/static/js/app.js +2078 -0
- local_deep_research/web/templates/api_keys_config.html +82 -0
- local_deep_research/web/templates/collections_config.html +90 -0
- local_deep_research/web/templates/index.html +312 -0
- local_deep_research/web/templates/llm_config.html +120 -0
- local_deep_research/web/templates/main_config.html +89 -0
- local_deep_research/web/templates/search_engines_config.html +154 -0
- local_deep_research/web/templates/settings.html +519 -0
- local_deep_research/web/templates/settings_dashboard.html +207 -0
- local_deep_research/web_search_engines/__init__.py +0 -0
- local_deep_research/web_search_engines/engines/__init__.py +0 -0
- local_deep_research/web_search_engines/engines/full_search.py +128 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
- local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
- local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
- local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
- local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
- local_deep_research/web_search_engines/full_search.py +254 -0
- local_deep_research/web_search_engines/search_engine_base.py +197 -0
- local_deep_research/web_search_engines/search_engine_factory.py +233 -0
- local_deep_research/web_search_engines/search_engines_config.py +54 -0
- local_deep_research-0.1.0.dist-info/LICENSE +21 -0
- local_deep_research-0.1.0.dist-info/METADATA +328 -0
- local_deep_research-0.1.0.dist-info/RECORD +56 -0
- local_deep_research-0.1.0.dist-info/WHEEL +5 -0
- local_deep_research-0.1.0.dist-info/entry_points.txt +3 -0
- local_deep_research-0.1.0.dist-info/top_level.txt +1 -0
local_deep_research/web_search_engines/engines/search_engine_local.py
@@ -0,0 +1,901 @@
from typing import Dict, List, Any, Optional, Tuple, Union
import os
import json
import hashlib
import time
from datetime import datetime
from pathlib import Path
import tiktoken
import logging
import re
import pickle

from langchain_core.language_models import BaseLLM
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    UnstructuredMarkdownLoader,
    UnstructuredWordDocumentLoader,
    CSVLoader,
    UnstructuredExcelLoader,
    DirectoryLoader
)
from langchain_community.document_loaders.base import BaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import (
    HuggingFaceEmbeddings,
    OllamaEmbeddings,
    SentenceTransformerEmbeddings
)

from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
from local_deep_research import config

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class LocalEmbeddingManager:
    """Handles embedding generation and storage for local document search"""

    def __init__(
        self,
        embedding_model: str = "all-MiniLM-L6-v2",
        embedding_device: str = "cpu",
        embedding_model_type: str = "sentence_transformers",  # or 'ollama'
        ollama_base_url: Optional[str] = None,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        cache_dir: str = ".cache/local_search",
    ):
        """
        Initialize the embedding manager for local document search.

        Args:
            embedding_model: Name of the embedding model to use
            embedding_device: Device to run embeddings on ('cpu' or 'cuda')
            embedding_model_type: Type of embedding model ('sentence_transformers' or 'ollama')
            ollama_base_url: Base URL for Ollama API if using ollama embeddings
            chunk_size: Size of text chunks for splitting documents
            chunk_overlap: Overlap between chunks
            cache_dir: Directory to store embedding cache and index
        """

        self.embedding_model = embedding_model
        self.embedding_device = embedding_device
        self.embedding_model_type = embedding_model_type
        self.ollama_base_url = ollama_base_url
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.cache_dir = Path(cache_dir)

        # Create cache directory if it doesn't exist
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Initialize the embedding model
        self._embeddings = None

        # Initialize the text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

        # Track indexed folders and their metadata
        self.indexed_folders = self._load_indexed_folders()

        # Vector store cache
        self.vector_stores = {}
    @property
    def embeddings(self):
        """
        Lazily initialize embeddings when first accessed.
        This allows the LocalEmbeddingManager to be created without
        immediately loading models, which is helpful when no local search is performed.
        """
        if self._embeddings is None:
            logger.info("Initializing embeddings on first use")
            self._embeddings = self._initialize_embeddings()
        return self._embeddings

    def _initialize_embeddings(self):
        """Initialize the embedding model based on configuration"""
        try:
            if self.embedding_model_type == "ollama":
                # Use Ollama for embeddings
                if not self.ollama_base_url:
                    self.ollama_base_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")

                logger.info(f"Initializing Ollama embeddings with model {self.embedding_model}")
                return OllamaEmbeddings(
                    model=self.embedding_model,
                    base_url=self.ollama_base_url
                )
            else:
                # Default: Use SentenceTransformers/HuggingFace
                logger.info(f"Initializing SentenceTransformerEmbeddings with model {self.embedding_model}")
                return SentenceTransformerEmbeddings(
                    model_name=self.embedding_model,
                    model_kwargs={"device": self.embedding_device}
                )
        except Exception as e:
            logger.error(f"Error initializing embeddings: {e}")
            logger.warning("Falling back to HuggingFaceEmbeddings with all-MiniLM-L6-v2")
            return HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
    def _load_or_create_vector_store(self):
        """Load the vector store from disk or create it if needed"""
        vector_store_path = self._get_vector_store_path()

        # Check if vector store exists and is up to date
        if vector_store_path.exists() and not self._check_folders_modified():
            logger.info(f"Loading existing vector store from {vector_store_path}")
            try:
                vector_store = FAISS.load_local(
                    str(vector_store_path),
                    self.embeddings,
                    allow_dangerous_deserialization=True
                )

                # Add this code to show document count
                doc_count = len(vector_store.index_to_docstore_id)
                logger.info(f"Loaded index with {doc_count} document chunks")

                return vector_store
            except Exception as e:
                logger.error(f"Error loading vector store: {e}")
                logger.info("Will create a new vector store")

        # Create a new vector store
        return self._create_vector_store()
    def _load_indexed_folders(self) -> Dict[str, Dict[str, Any]]:
        """Load metadata about indexed folders from disk"""
        index_metadata_path = self.cache_dir / "index_metadata.json"

        if index_metadata_path.exists():
            try:
                with open(index_metadata_path, "r") as f:
                    return json.load(f)
            except Exception as e:
                logger.error(f"Error loading index metadata: {e}")

        return {}

    def _save_indexed_folders(self):
        """Save metadata about indexed folders to disk"""
        index_metadata_path = self.cache_dir / "index_metadata.json"

        try:
            with open(index_metadata_path, "w") as f:
                json.dump(self.indexed_folders, f, indent=2)
        except Exception as e:
            logger.error(f"Error saving index metadata: {e}")

    def _get_folder_hash(self, folder_path: str) -> str:
        """Generate a hash for a folder based on its path"""
        return hashlib.md5(folder_path.encode()).hexdigest()

    def _get_index_path(self, folder_path: str) -> Path:
        """Get the path where the index for a specific folder should be stored"""
        folder_hash = self._get_folder_hash(folder_path)
        return self.cache_dir / f"index_{folder_hash}"

    def _check_folder_modified(self, folder_path: str) -> bool:
        """Check if a folder has been modified since it was last indexed"""
        folder_path = Path(folder_path)

        if not folder_path.exists() or not folder_path.is_dir():
            return False

        folder_hash = self._get_folder_hash(str(folder_path))

        # If folder has never been indexed, it's considered modified
        if folder_hash not in self.indexed_folders:
            return True

        last_indexed = self.indexed_folders[folder_hash].get("last_indexed", 0)

        # Check if any file in the folder has been modified since last indexing
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = Path(root) / file
                if file_path.stat().st_mtime > last_indexed:
                    return True

        return False

    def get_file_loader(self, file_path: str) -> Optional[BaseLoader]:
        """Get an appropriate document loader for a file based on its extension"""
        file_path = Path(file_path)
        extension = file_path.suffix.lower()

        try:
            if extension == ".pdf":
                return PyPDFLoader(str(file_path))
            elif extension == ".txt":
                return TextLoader(str(file_path))
            elif extension in [".md", ".markdown"]:
                return UnstructuredMarkdownLoader(str(file_path))
            elif extension in [".doc", ".docx"]:
                return UnstructuredWordDocumentLoader(str(file_path))
            elif extension == ".csv":
                return CSVLoader(str(file_path))
            elif extension in [".xls", ".xlsx"]:
                return UnstructuredExcelLoader(str(file_path))
            else:
                # Try the text loader as a fallback for unknown extensions
                logger.warning(f"Unknown file extension for {file_path}, trying TextLoader")
                return TextLoader(str(file_path), encoding="utf-8")
        except Exception as e:
            logger.error(f"Error creating loader for {file_path}: {e}")
            return None

    def index_folder(self, folder_path: str, force_reindex: bool = False) -> bool:
        """
        Index all documents in a folder for vector search.

        Args:
            folder_path: Path to the folder to index
            force_reindex: Whether to force reindexing even if unchanged

        Returns:
            bool: True if indexing was successful, False otherwise
        """
        folder_path = Path(folder_path)

        # Validate folder
        if not folder_path.exists():
            logger.error(f"Folder not found: {folder_path}")
            return False

        if not folder_path.is_dir():
            logger.error(f"Path is not a directory: {folder_path}")
            return False

        folder_str = str(folder_path)
        folder_hash = self._get_folder_hash(folder_str)
        index_path = self._get_index_path(folder_str)

        # Check if folder needs to be reindexed
        if not force_reindex and not self._check_folder_modified(folder_str):
            logger.info(f"Folder {folder_path} has not been modified since last indexing")

            # Load the vector store from disk if not already loaded
            if folder_hash not in self.vector_stores:
                try:
                    self.vector_stores[folder_hash] = FAISS.load_local(
                        str(index_path),
                        self.embeddings,
                        allow_dangerous_deserialization=True
                    )
                    logger.info(f"Loaded index for {folder_path} from disk")
                except Exception as e:
                    logger.error(f"Error loading index for {folder_path}: {e}")
                    # If loading fails, force reindexing
                    force_reindex = True
            else:
                logger.info(f"Using cached index for {folder_path}")

            # If no reindexing is needed and vector store loaded successfully
            if not force_reindex and folder_hash in self.vector_stores:
                return True

        logger.info(f"Indexing folder: {folder_path}")
        start_time = time.time()

        # Find documents to index
        all_docs = []
        file_count = 0
        error_count = 0

        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = Path(root) / file

                # Skip hidden files and directories
                if file.startswith(".") or any(part.startswith(".") for part in file_path.parts):
                    continue

                # Get a loader for this file
                loader = self.get_file_loader(str(file_path))

                if loader:
                    try:
                        # Load the document
                        docs = loader.load()

                        # Add source path metadata
                        for doc in docs:
                            doc.metadata["source"] = str(file_path)
                            doc.metadata["filename"] = file

                        all_docs.extend(docs)
                        file_count += 1
                    except Exception as e:
                        logger.error(f"Error loading {file_path}: {e}")
                        error_count += 1

        if not all_docs:
            logger.warning(f"No documents found in {folder_path} or all documents failed to load")
            return False

        # Split documents into chunks
        logger.info(f"Splitting {len(all_docs)} documents into chunks")
        splits = self.text_splitter.split_documents(all_docs)
        logger.info(f"Created {len(splits)} chunks from {file_count} files")

        # Create vector store
        logger.info(f"Creating vector store with {len(splits)} chunks")
        vector_store = FAISS.from_documents(splits, self.embeddings)

        # Save the vector store to disk
        logger.info(f"Saving index to {index_path}")
        vector_store.save_local(str(index_path))

        # Update cache
        self.vector_stores[folder_hash] = vector_store

        # Update metadata
        self.indexed_folders[folder_hash] = {
            "path": folder_str,
            "last_indexed": time.time(),
            "file_count": file_count,
            "chunk_count": len(splits),
            "error_count": error_count,
            "embedding_model": self.embedding_model,
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap
        }

        # Save updated metadata
        self._save_indexed_folders()

        elapsed_time = time.time() - start_time
        logger.info(f"Indexed {file_count} files in {elapsed_time:.2f} seconds")

        return True

    def search(
        self,
        query: str,
        folder_paths: List[str],
        limit: int = 10,
        score_threshold: float = 0.0,
    ) -> List[Dict[str, Any]]:
        """
        Search for documents relevant to a query across specified folders.

        Args:
            query: The search query
            folder_paths: List of folder paths to search in
            limit: Maximum number of results to return
            score_threshold: Minimum similarity score threshold

        Returns:
            List of results with document content and metadata
        """
        # Add detailed debugging for each folder
        for folder_path in folder_paths:
            folder_hash = self._get_folder_hash(folder_path)
            index_path = self._get_index_path(folder_path)

            logger.info(f"Diagnostic for {folder_path}:")
            logger.info(f"  - Folder hash: {folder_hash}")
            logger.info(f"  - Index path: {index_path}")
            logger.info(f"  - Index exists on disk: {index_path.exists()}")
            logger.info(f"  - Is in indexed_folders: {folder_hash in self.indexed_folders}")

            if folder_hash in self.indexed_folders:
                meta = self.indexed_folders[folder_hash]
                logger.info(f"  - Metadata: file_count={meta.get('file_count', 0)}, chunk_count={meta.get('chunk_count', 0)}")

        # Validate folders exist
        valid_folder_paths = []
        for path in folder_paths:
            if os.path.exists(path) and os.path.isdir(path):
                valid_folder_paths.append(path)
            else:
                logger.warning(f"Skipping non-existent folder in search: {path}")

        # If no valid folders, return empty results
        if not valid_folder_paths:
            logger.warning(f"No valid folders to search among: {folder_paths}")
            return []

        all_results = []

        for folder_path in valid_folder_paths:
            folder_hash = self._get_folder_hash(folder_path)

            # Skip folders that haven't been indexed
            if folder_hash not in self.indexed_folders:
                logger.warning(f"Folder {folder_path} has not been indexed")
                continue

            # Make sure the vector store is loaded
            if folder_hash not in self.vector_stores:
                index_path = self._get_index_path(folder_path)
                try:
                    self.vector_stores[folder_hash] = FAISS.load_local(
                        str(index_path),
                        self.embeddings,
                        allow_dangerous_deserialization=True
                    )
                except Exception as e:
                    logger.error(f"Error loading index for {folder_path}: {e}")
                    continue

            # Search in this folder
            vector_store = self.vector_stores[folder_hash]

            try:
                docs_with_scores = vector_store.similarity_search_with_score(query, k=limit)

                for doc, score in docs_with_scores:
                    # Convert score from distance to similarity (lower distance = higher similarity)
                    # FAISS cosine distance is in [0, 2], where 0 is identical and 2 is opposite
                    # Convert to a similarity score in [0, 1]
                    similarity = 1.0 - (score / 2.0)

                    # Skip results below the threshold
                    if similarity < score_threshold:
                        continue

                    result = {
                        "content": doc.page_content,
                        "metadata": doc.metadata,
                        "similarity": float(similarity),
                        "folder": folder_path
                    }

                    all_results.append(result)
            except Exception as e:
                logger.error(f"Error searching in {folder_path}: {e}")

        # Sort by similarity (highest first)
        all_results.sort(key=lambda x: x["similarity"], reverse=True)

        # Limit to the requested number
        return all_results[:limit]

    def clear_cache(self):
        """Clear all cached vector stores from memory (not disk)"""
        self.vector_stores.clear()

    def get_indexed_folders_info(self) -> List[Dict[str, Any]]:
        """Get information about all indexed folders"""
        info = []

        for folder_hash, metadata in self.indexed_folders.items():
            folder_info = metadata.copy()

            # Add formatted last indexed time
            if "last_indexed" in folder_info:
                folder_info["last_indexed_formatted"] = datetime.fromtimestamp(
                    folder_info["last_indexed"]
                ).strftime("%Y-%m-%d %H:%M:%S")

            # Check if index file exists
            index_path = self._get_index_path(folder_info["path"])
            folder_info["index_exists"] = index_path.exists()

            info.append(folder_info)

        return info


class LocalSearchEngine(BaseSearchEngine):
    """Local document search engine with two-phase retrieval"""

    def __init__(
        self,
        folder_paths: List[str],
        llm: Optional[BaseLLM] = None,
        max_results: int = 10,
        max_filtered_results: Optional[int] = None,
        embedding_model: str = "all-MiniLM-L6-v2",
        embedding_device: str = "cpu",
        embedding_model_type: str = "sentence_transformers",
        ollama_base_url: Optional[str] = None,
        force_reindex: bool = False,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        cache_dir: str = ".cache/local_search",
        collections: Optional[Dict[str, Dict[str, Any]]] = None,
    ):
        """
        Initialize the local search engine.

        Args:
            folder_paths: List of folder paths to search in
            llm: Language model for relevance filtering
            max_results: Maximum number of results to return
            max_filtered_results: Maximum results after filtering
            embedding_model: Name of the embedding model to use
            embedding_device: Device to run embeddings on ('cpu' or 'cuda')
            embedding_model_type: Type of embedding model
            ollama_base_url: Base URL for Ollama API
            force_reindex: Whether to force reindexing
            chunk_size: Size of text chunks for splitting documents
            chunk_overlap: Overlap between chunks
            cache_dir: Directory to store embedding cache and index
            collections: Dictionary of named collections with paths and descriptions
        """
        # Initialize the base search engine
        super().__init__(llm=llm, max_filtered_results=max_filtered_results)

        # Validate folder paths
        self.folder_paths = folder_paths
        self.valid_folder_paths = []
        for path in folder_paths:
            if os.path.exists(path) and os.path.isdir(path):
                self.valid_folder_paths.append(path)
            else:
                logger.warning(f"Folder not found or is not a directory: {path}")

        # If no valid folders, log a clear message
        if not self.valid_folder_paths and folder_paths:
            logger.warning(f"No valid folders found among: {folder_paths}")
            logger.warning("This search engine will return no results until valid folders are configured")

        self.max_results = max_results
        self.collections = collections or {"default": {"paths": folder_paths, "description": "Default collection"}}

        # Initialize the embedding manager with only valid folders
        self.embedding_manager = LocalEmbeddingManager(
            embedding_model=embedding_model,
            embedding_device=embedding_device,
            embedding_model_type=embedding_model_type,
            ollama_base_url=ollama_base_url,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            cache_dir=cache_dir
        )

        # Index all folders
        self._index_folders(force_reindex)

    def _index_folders(self, force_reindex: bool = False):
        """Index all valid configured folders"""
        indexed = []
        failed = []
        skipped = []

        # Keep track of invalid folders
        for folder in self.folder_paths:
            if folder not in self.valid_folder_paths:
                skipped.append(folder)
                continue

            success = self.embedding_manager.index_folder(folder, force_reindex)
            if success:
                indexed.append(folder)
            else:
                failed.append(folder)

        if indexed:
            logger.info(f"Successfully indexed {len(indexed)} folders: {', '.join(indexed)}")

        if failed:
            logger.warning(f"Failed to index {len(failed)} folders: {', '.join(failed)}")

        if skipped:
            logger.warning(f"Skipped {len(skipped)} invalid folders: {', '.join(skipped)}")

    def _get_previews(self, query: str, collection_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """
        Get preview information for documents matching the query.

        Args:
            query: The search query
            collection_names: Specific collections to search within (if None, search all)

        Returns:
            List of preview dictionaries
        """
        # Determine which collections to search
        if collection_names:
            # Search only in specified collections
            collections_to_search = {name: self.collections[name] for name in collection_names
                                     if name in self.collections}
            if not collections_to_search:
                logger.warning(f"No valid collections found among: {collection_names}")
                return []
        else:
            # Search in all collections
            collections_to_search = self.collections

        # Extract all folder paths from the collections to search
        search_paths = []
        for collection_config in collections_to_search.values():
            if "paths" in collection_config:
                search_paths.extend(collection_config["paths"])

        logger.info(f"Searching local documents in collections: {list(collections_to_search.keys())}")

        # Filter out invalid paths
        valid_search_paths = [path for path in search_paths if path in self.valid_folder_paths]

        if not valid_search_paths:
            logger.warning(f"No valid folders to search in collections: {list(collections_to_search.keys())}")
            return []

        # Search across the valid selected folders
        raw_results = self.embedding_manager.search(
            query=query,
            folder_paths=valid_search_paths,
            limit=self.max_results,
            score_threshold=0.1  # Skip very low relevance results
        )

        if not raw_results:
            logger.info(f"No local documents found for query: {query}")
            return []

        # Convert to preview format
        previews = []
        for i, result in enumerate(raw_results):
            # Create a unique ID
            result_id = f"local-{i}-{hashlib.md5(result['content'][:50].encode()).hexdigest()}"

            # Extract filename and path
            source_path = result['metadata'].get('source', 'Unknown')
            filename = result['metadata'].get('filename', os.path.basename(source_path))

            # Create preview snippet (first ~200 chars of content)
            snippet = result['content'][:200] + "..." if len(result['content']) > 200 else result['content']

            # Determine which collection this document belongs to
            collection_name = "Unknown"
            folder_path = result['folder']
            for name, collection in self.collections.items():
                if any(folder_path.startswith(path) for path in collection.get("paths", [])):
                    collection_name = name
                    break

            # Format the preview
            preview = {
                "id": result_id,
                "title": filename,
                "snippet": snippet,
                "link": source_path,
                "similarity": result['similarity'],
                "folder": folder_path,
                "collection": collection_name,
                "collection_description": self.collections.get(collection_name, {}).get("description", ""),
                "_full_content": result['content'],  # Store full content for later
                "_metadata": result['metadata']  # Store metadata for later
            }

            previews.append(preview)

        logger.info(f"Found {len(previews)} local document matches")
        return previews

    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant documents.
        For local search, the full content is already available.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should add full content
        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
            logger.info("Snippet-only mode, skipping full content addition")
            return relevant_items

        # For local search, we already have the full content
        results = []
        for item in relevant_items:
            # Create a copy with full content
            result = item.copy()

            # Add full content if we have it
            if "_full_content" in item:
                result["content"] = item["_full_content"]
                result["full_content"] = item["_full_content"]

                # Remove temporary fields
                if "_full_content" in result:
                    del result["_full_content"]

            # Add metadata if we have it
            if "_metadata" in item:
                result["document_metadata"] = item["_metadata"]

                # Remove temporary fields
                if "_metadata" in result:
                    del result["_metadata"]

            results.append(result)

        return results

    def run(self, query: str, collection_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """
        Execute a search using the two-phase approach.

        Args:
            query: The search query
            collection_names: Specific collections to search within (if None, search all)

        Returns:
            List of search result dictionaries with full content
        """
        logger.info(f"---Execute a search using Local Documents---")

        # Check if we have any special collection parameters in the query
        collection_prefix = "collection:"
        remaining_query = query
        specified_collections = []

        # Parse query for collection specifications like "collection:research_papers query terms"
        query_parts = query.split()
        for part in query_parts:
            if part.lower().startswith(collection_prefix):
                collection_name = part[len(collection_prefix):].strip()
                if collection_name in self.collections:
                    specified_collections.append(collection_name)
                    # Remove this part from the query
                    remaining_query = remaining_query.replace(part, "", 1).strip()

        # If collections were specified in the query, they override the parameter
        if specified_collections:
            collection_names = specified_collections
            query = remaining_query

        # Phase 1: Get previews (with collection filtering)
        previews = self._get_previews(query, collection_names)

        if not previews:
            return []

        # Phase 2: Filter for relevance
        relevant_items = self._filter_for_relevance(previews, query)

        if not relevant_items:
            return []

        # Phase 3: Get full content for relevant items
        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
            print("Returning snippet-only results as per config")
            results = relevant_items
        else:
            results = self._get_full_content(relevant_items)

        # Clean up temporary data
        self.embedding_manager.clear_cache()

        return results

    def get_collections_info(self) -> List[Dict[str, Any]]:
        """
        Get information about all collections, including indexing status.

        Returns:
            List of collection information dictionaries
        """
        collections_info = []

        for name, collection in self.collections.items():
            paths = collection.get("paths", [])
            description = collection.get("description", "")

            # Get indexing information for each path
            paths_info = []
            for path in paths:
                # Check if folder exists
                exists = os.path.exists(path) and os.path.isdir(path)

                # Check if folder is indexed
                folder_hash = self.embedding_manager._get_folder_hash(path)
                indexed = folder_hash in self.embedding_manager.indexed_folders

                # Get index details if available
                index_info = {}
                if indexed:
                    index_info = self.embedding_manager.indexed_folders[folder_hash].copy()

                paths_info.append({
                    "path": path,
                    "exists": exists,
                    "indexed": indexed,
                    "index_info": index_info
                })

            collections_info.append({
                "name": name,
                "description": description,
                "paths": paths,
                "paths_info": paths_info,
                "document_count": sum(info.get("index_info", {}).get("file_count", 0) for info in paths_info),
                "chunk_count": sum(info.get("index_info", {}).get("chunk_count", 0) for info in paths_info),
                "all_indexed": all(info["indexed"] for info in paths_info if info["exists"])
            })

        return collections_info

    def reindex_collection(self, collection_name: str) -> bool:
        """
        Reindex a specific collection.

        Args:
            collection_name: Name of the collection to reindex

        Returns:
            True if reindexing was successful, False otherwise
        """
        if collection_name not in self.collections:
            logger.error(f"Collection '{collection_name}' not found")
            return False

        paths = self.collections[collection_name].get("paths", [])
        success = True

        for path in paths:
            if not self.embedding_manager.index_folder(path, force_reindex=True):
                success = False

        return success

    @classmethod
    def from_config(cls, config_dict: Dict[str, Any], llm: Optional[BaseLLM] = None) -> "LocalSearchEngine":
        """
        Create a LocalSearchEngine instance from a configuration dictionary.

        Args:
            config_dict: Configuration dictionary
            llm: Language model for relevance filtering

        Returns:
            Initialized LocalSearchEngine instance
        """
        # Required parameters
        folder_paths = []
        collections = config_dict.get("collections", {})

        # Extract all folder paths from collections
        for collection_config in collections.values():
            if "paths" in collection_config:
                folder_paths.extend(collection_config["paths"])

        # Fall back to folder_paths if no collections defined
        if not folder_paths:
            folder_paths = config_dict.get("folder_paths", [])
            # Create a default collection if using folder_paths
            if folder_paths:
                collections = {"default": {"paths": folder_paths, "description": "Default collection"}}

        # Optional parameters with defaults
        max_results = config_dict.get("max_results", 10)
        max_filtered_results = config_dict.get("max_filtered_results")
        embedding_model = config_dict.get("embedding_model", "all-MiniLM-L6-v2")
        embedding_device = config_dict.get("embedding_device", "cpu")
        embedding_model_type = config_dict.get("embedding_model_type", "sentence_transformers")
        ollama_base_url = config_dict.get("ollama_base_url")
        force_reindex = config_dict.get("force_reindex", False)
        chunk_size = config_dict.get("chunk_size", 1000)
        chunk_overlap = config_dict.get("chunk_overlap", 200)
        cache_dir = config_dict.get("cache_dir", ".cache/local_search")

        return cls(
            folder_paths=folder_paths,
            collections=collections,
            llm=llm,
            max_results=max_results,
            max_filtered_results=max_filtered_results,
            embedding_model=embedding_model,
            embedding_device=embedding_device,
            embedding_model_type=embedding_model_type,
            ollama_base_url=ollama_base_url,
            force_reindex=force_reindex,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            cache_dir=cache_dir
        )