local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +154 -160
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +87 -45
- local_deep_research/search_system.py +153 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1583 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
- local_deep_research-0.2.2.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
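Most of the entries above amount to a package-level reorganization: the single config.py becomes a config/ package (config_files.py, llm_config.py, search_config.py), and the misspelled utilties package is renamed to utilities. For downstream code the visible effect is the import paths. The sketch below is a hypothetical adjustment, assuming a consumer imported these modules directly; it uses only module paths that appear in the file list, since the public re-exports in local_deep_research/__init__.py are not shown here.

```python
# Hypothetical downstream import adjustment for the 0.1.26 -> 0.2.2 layout change.
#
# 0.1.26 (old paths, removed or renamed in 0.2.2):
#   from local_deep_research import config
#   from local_deep_research.utilties import enums, search_utilities
#
# 0.2.2 (module paths taken from the file list above):
from local_deep_research.config import config_files, llm_config, search_config
from local_deep_research.utilities import enums, search_utilities
```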
local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369 (per-file diff, summarized by hunk):

- @@ -1,46 +1,107 @@: imports are regrouped (adds uuid, concurrent.futures.ProcessPoolExecutor, faiss.IndexFlatL2, InMemoryDocstore, langchain_core Document and BaseLLM, plus "from ...config import search_config" and "from ..search_engine_base import BaseSearchEngine"), and two module-level helpers are added: _get_file_loader(), which picks a LangChain document loader by file extension with TextLoader as the fallback, and _load_document(), which loads a file and stamps source/filename metadata on each document.
- @@ -53,7 +114,7 @@, @@ -63,7 +124,7 @@ and @@ -71,29 +132,29 @@ class LocalEmbeddingManager: whitespace and argument-formatting cleanup in __init__ and its docstring.
- @@ -107,30 +168,38 @@ class LocalEmbeddingManager: the lazy embeddings property reflows its Ollama and SentenceTransformer initialization, logging, and the HuggingFaceEmbeddings fallback.
- @@ -139,86 +208,121 @@ class LocalEmbeddingManager: get_folder_hash() becomes a static method over a canonicalized path, _get_index_path() and _check_folder_modified() take Path arguments, and new _get_all_files() and _get_modified_files() helpers compare file modification times against the last index time and the recorded indexed_files.
- @@ -231,228 +335,225 @@ class LocalEmbeddingManager: the old get_file_loader() method is dropped in favor of the module-level helper; index_folder() now re-indexes only modified files (unless force_reindex is set or the embedding configuration changed), skips hidden files and directories, loads documents in parallel with a ProcessPoolExecutor, builds the FAISS store from an IndexFlatL2 index with an InMemoryDocstore, tracks per-file chunk IDs in indexed_files, and deletes chunks for files that no longer exist; search() converts its folder paths to pathlib.Path.
- @@ -461,74 +562,71 @@ class LocalEmbeddingManager: the remainder of search(), clear_cache() and get_indexed_folders_info() are reformatted, with the index path derived from Path(folder_info["path"]).
- @@ -547,7 +645,7 @@, @@ -565,7 +663,7 @@ and @@ -574,15 +672,19 @@ class LocalSearchEngine(BaseSearchEngine): constructor docstring, folder-validation logging and the default-collection setup are reflowed.
- @@ -591,339 +693,400 @@ class LocalSearchEngine(BaseSearchEngine): _index_folders(), _get_previews(), _get_full_content(), run(), get_collections_info(), reindex_collection() and from_config() are reformatted; the snippet-only check reads search_config.SEARCH_SNIPPETS_ONLY from the new config package, preview results report their folder via Path.as_posix() and collection membership via Path.is_relative_to(), and run() still supports the "collection:" query prefix for restricting a search to one collection.
- @@ -937,5 +1100,5 @@ class LocalSearchEngine(BaseSearchEngine): trailing-comma cleanup in the from_config() return.