local-deep-research 0.3.12__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff compares two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
- local_deep_research/__init__.py +1 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
- local_deep_research/advanced_search_system/findings/repository.py +0 -3
- local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
- local_deep_research/api/benchmark_functions.py +288 -0
- local_deep_research/api/research_functions.py +8 -4
- local_deep_research/benchmarks/README.md +162 -0
- local_deep_research/benchmarks/__init__.py +51 -0
- local_deep_research/benchmarks/benchmark_functions.py +353 -0
- local_deep_research/benchmarks/cli/__init__.py +16 -0
- local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
- local_deep_research/benchmarks/cli.py +347 -0
- local_deep_research/benchmarks/comparison/__init__.py +12 -0
- local_deep_research/benchmarks/comparison/evaluator.py +768 -0
- local_deep_research/benchmarks/datasets/__init__.py +53 -0
- local_deep_research/benchmarks/datasets/base.py +295 -0
- local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
- local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
- local_deep_research/benchmarks/datasets/utils.py +116 -0
- local_deep_research/benchmarks/datasets.py +31 -0
- local_deep_research/benchmarks/efficiency/__init__.py +14 -0
- local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
- local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
- local_deep_research/benchmarks/evaluators/__init__.py +18 -0
- local_deep_research/benchmarks/evaluators/base.py +74 -0
- local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
- local_deep_research/benchmarks/evaluators/composite.py +121 -0
- local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
- local_deep_research/benchmarks/graders.py +410 -0
- local_deep_research/benchmarks/metrics/README.md +80 -0
- local_deep_research/benchmarks/metrics/__init__.py +24 -0
- local_deep_research/benchmarks/metrics/calculation.py +385 -0
- local_deep_research/benchmarks/metrics/reporting.py +155 -0
- local_deep_research/benchmarks/metrics/visualization.py +205 -0
- local_deep_research/benchmarks/metrics.py +11 -0
- local_deep_research/benchmarks/optimization/__init__.py +32 -0
- local_deep_research/benchmarks/optimization/api.py +274 -0
- local_deep_research/benchmarks/optimization/metrics.py +20 -0
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
- local_deep_research/benchmarks/runners.py +434 -0
- local_deep_research/benchmarks/templates.py +65 -0
- local_deep_research/config/llm_config.py +26 -23
- local_deep_research/config/search_config.py +1 -5
- local_deep_research/defaults/default_settings.json +108 -7
- local_deep_research/search_system.py +16 -8
- local_deep_research/utilities/db_utils.py +3 -6
- local_deep_research/utilities/es_utils.py +441 -0
- local_deep_research/utilities/log_utils.py +36 -0
- local_deep_research/utilities/search_utilities.py +8 -9
- local_deep_research/web/app.py +15 -10
- local_deep_research/web/app_factory.py +9 -12
- local_deep_research/web/database/migrations.py +8 -5
- local_deep_research/web/database/models.py +20 -0
- local_deep_research/web/database/schema_upgrade.py +5 -8
- local_deep_research/web/models/database.py +15 -18
- local_deep_research/web/routes/benchmark_routes.py +427 -0
- local_deep_research/web/routes/research_routes.py +13 -17
- local_deep_research/web/routes/settings_routes.py +264 -67
- local_deep_research/web/services/research_service.py +58 -73
- local_deep_research/web/services/settings_manager.py +1 -4
- local_deep_research/web/services/settings_service.py +4 -6
- local_deep_research/web/static/css/styles.css +12 -0
- local_deep_research/web/static/js/components/logpanel.js +164 -155
- local_deep_research/web/static/js/components/research.js +44 -3
- local_deep_research/web/static/js/components/settings.js +27 -0
- local_deep_research/web/static/js/services/socket.js +47 -0
- local_deep_research/web_search_engines/default_search_engines.py +38 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
- local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
- local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
- local_deep_research/web_search_engines/search_engine_base.py +22 -5
- local_deep_research/web_search_engines/search_engine_factory.py +30 -11
- local_deep_research/web_search_engines/search_engines_config.py +14 -1
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/METADATA +10 -2
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/RECORD +93 -51
- local_deep_research/app.py +0 -8
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/WHEEL +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/licenses/LICENSE +0 -0
local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py

@@ -0,0 +1,343 @@
+import json
+import logging
+from typing import Any, Dict, List, Optional
+
+from elasticsearch import Elasticsearch
+from langchain_core.language_models import BaseLLM
+
+from ...config import search_config
+from ..search_engine_base import BaseSearchEngine
+
+logger = logging.getLogger(__name__)
+
+
+class ElasticsearchSearchEngine(BaseSearchEngine):
+    """Elasticsearch search engine implementation with two-phase approach"""
+
+    def __init__(
+        self,
+        hosts: List[str] = ["http://localhost:9200"],
+        index_name: str = "documents",
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+        api_key: Optional[str] = None,
+        cloud_id: Optional[str] = None,
+        max_results: int = 10,
+        highlight_fields: List[str] = ["content", "title"],
+        search_fields: List[str] = ["content", "title"],
+        filter_query: Optional[Dict[str, Any]] = None,
+        llm: Optional[BaseLLM] = None,
+        max_filtered_results: Optional[int] = None,
+    ):
+        """
+        Initialize the Elasticsearch search engine.
+
+        Args:
+            hosts: List of Elasticsearch hosts
+            index_name: Name of the index to search
+            username: Optional username for authentication
+            password: Optional password for authentication
+            api_key: Optional API key for authentication
+            cloud_id: Optional Elastic Cloud ID
+            max_results: Maximum number of search results
+            highlight_fields: Fields to highlight in search results
+            search_fields: Fields to search in
+            filter_query: Optional filter query in Elasticsearch DSL format
+            llm: Language model for relevance filtering
+            max_filtered_results: Maximum number of results to keep after filtering
+        """
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(
+            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+        )
+
+        self.index_name = index_name
+        self.highlight_fields = highlight_fields
+        self.search_fields = search_fields
+        self.filter_query = filter_query or {}
+
+        # Initialize the Elasticsearch client
+        es_args = {}
+
+        # Basic authentication
+        if username and password:
+            es_args["basic_auth"] = (username, password)
+
+        # API key authentication
+        if api_key:
+            es_args["api_key"] = api_key
+
+        # Cloud ID for Elastic Cloud
+        if cloud_id:
+            es_args["cloud_id"] = cloud_id
+
+        # Connect to Elasticsearch
+        self.client = Elasticsearch(hosts, **es_args)
+
+        # Verify connection
+        try:
+            info = self.client.info()
+            logger.info(f"Connected to Elasticsearch cluster: {info.get('cluster_name')}")
+            logger.info(f"Elasticsearch version: {info.get('version', {}).get('number')}")
+        except Exception as e:
+            logger.error(f"Failed to connect to Elasticsearch: {str(e)}")
+            raise ConnectionError(f"Could not connect to Elasticsearch: {str(e)}")
+
+    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
+        """
+        Get preview information for Elasticsearch documents.
+
+        Args:
+            query: The search query
+
+        Returns:
+            List of preview dictionaries
+        """
+        logger.info(f"Getting document previews from Elasticsearch with query: {query}")
+
+        try:
+            # Build the search query
+            search_query = {
+                "query": {
+                    "multi_match": {
+                        "query": query,
+                        "fields": self.search_fields,
+                        "type": "best_fields",
+                        "tie_breaker": 0.3,
+                    }
+                },
+                "highlight": {
+                    "fields": {field: {} for field in self.highlight_fields},
+                    "pre_tags": ["<em>"],
+                    "post_tags": ["</em>"],
+                },
+                "size": self.max_results,
+            }
+
+            # Add filter if provided
+            if self.filter_query:
+                search_query["query"] = {
+                    "bool": {
+                        "must": search_query["query"],
+                        "filter": self.filter_query
+                    }
+                }
+
+            # Execute the search
+            response = self.client.search(
+                index=self.index_name,
+                body=search_query,
+            )
+
+            # Process the search results
+            hits = response.get("hits", {}).get("hits", [])
+
+            # Format results as previews with basic information
+            previews = []
+            for hit in hits:
+                source = hit.get("_source", {})
+                highlight = hit.get("highlight", {})
+
+                # Extract highlighted snippets or fall back to original content
+                snippet = ""
+                for field in self.highlight_fields:
+                    if field in highlight and highlight[field]:
+                        # Join all highlights for this field
+                        field_snippets = " ... ".join(highlight[field])
+                        snippet += field_snippets + " "
+
+                # If no highlights, use a portion of the content
+                if not snippet and "content" in source:
+                    content = source.get("content", "")
+                    snippet = content[:250] + "..." if len(content) > 250 else content
+
+                # Create preview object
+                preview = {
+                    "id": hit.get("_id", ""),
+                    "title": source.get("title", "Untitled Document"),
+                    "link": source.get("url", "") or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
+                    "snippet": snippet.strip(),
+                    "score": hit.get("_score", 0),
+                    "_index": hit.get("_index", self.index_name),
+                }
+
+                previews.append(preview)
+
+            logger.info(f"Found {len(previews)} preview results from Elasticsearch")
+            return previews
+
+        except Exception as e:
+            logger.error(f"Error getting Elasticsearch previews: {str(e)}")
+            return []
+
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """
+        Get full content for the relevant Elasticsearch documents.
+
+        Args:
+            relevant_items: List of relevant preview dictionaries
+
+        Returns:
+            List of result dictionaries with full content
+        """
+        # Check if we should get full content
+        if (
+            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            and search_config.SEARCH_SNIPPETS_ONLY
+        ):
+            logger.info("Snippet-only mode, skipping full content retrieval")
+            return relevant_items
+
+        logger.info("Getting full content for relevant Elasticsearch documents")
+
+        results = []
+        for item in relevant_items:
+            # Start with the preview data
+            result = item.copy()
+
+            # Get the document ID
+            doc_id = item.get("id")
+            if not doc_id:
+                # Skip items without ID
+                logger.warning(f"Skipping item without ID: {item}")
+                results.append(result)
+                continue
+
+            try:
+                # Fetch the full document
+                doc_response = self.client.get(
+                    index=self.index_name,
+                    id=doc_id,
+                )
+
+                # Get the source document
+                source = doc_response.get("_source", {})
+
+                # Add full content to the result
+                result["content"] = source.get("content", result.get("snippet", ""))
+                result["full_content"] = source.get("content", "")
+
+                # Add metadata from source
+                for key, value in source.items():
+                    if key not in result and key not in ["content"]:
+                        result[key] = value
+
+            except Exception as e:
+                logger.error(f"Error fetching full content for document {doc_id}: {str(e)}")
+                # Keep the preview data if we can't get the full content
+
+            results.append(result)
+
+        return results
+
+    def search_by_query_string(self, query_string: str) -> List[Dict[str, Any]]:
+        """
+        Perform a search using Elasticsearch Query String syntax.
+
+        Args:
+            query_string: The query in Elasticsearch Query String syntax
+
+        Returns:
+            List of search results
+        """
+        try:
+            # Build the search query
+            search_query = {
+                "query": {
+                    "query_string": {
+                        "query": query_string,
+                        "fields": self.search_fields,
+                    }
+                },
+                "highlight": {
+                    "fields": {field: {} for field in self.highlight_fields},
+                    "pre_tags": ["<em>"],
+                    "post_tags": ["</em>"],
+                },
+                "size": self.max_results,
+            }
+
+            # Execute the search
+            response = self.client.search(
+                index=self.index_name,
+                body=search_query,
+            )
+
+            # Process and return the results
+            previews = self._process_es_response(response)
+            return self._get_full_content(previews)
+
+        except Exception as e:
+            logger.error(f"Error in query_string search: {str(e)}")
+            return []
+
+    def search_by_dsl(self, query_dsl: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+        Perform a search using Elasticsearch DSL (Query Domain Specific Language).
+
+        Args:
+            query_dsl: The query in Elasticsearch DSL format
+
+        Returns:
+            List of search results
+        """
+        try:
+            # Execute the search with the provided DSL
+            response = self.client.search(
+                index=self.index_name,
+                body=query_dsl,
+            )
+
+            # Process and return the results
+            previews = self._process_es_response(response)
+            return self._get_full_content(previews)
+
+        except Exception as e:
+            logger.error(f"Error in DSL search: {str(e)}")
+            return []
+
+    def _process_es_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+        Process Elasticsearch response into preview dictionaries.
+
+        Args:
+            response: Elasticsearch response dictionary
+
+        Returns:
+            List of preview dictionaries
+        """
+        hits = response.get("hits", {}).get("hits", [])
+
+        # Format results as previews
+        previews = []
+        for hit in hits:
+            source = hit.get("_source", {})
+            highlight = hit.get("highlight", {})
+
+            # Extract highlighted snippets or fall back to original content
+            snippet = ""
+            for field in self.highlight_fields:
+                if field in highlight and highlight[field]:
+                    field_snippets = " ... ".join(highlight[field])
+                    snippet += field_snippets + " "
+
+            # If no highlights, use a portion of the content
+            if not snippet and "content" in source:
+                content = source.get("content", "")
+                snippet = content[:250] + "..." if len(content) > 250 else content
+
+            # Create preview object
+            preview = {
+                "id": hit.get("_id", ""),
+                "title": source.get("title", "Untitled Document"),
+                "link": source.get("url", "") or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
+                "snippet": snippet.strip(),
+                "score": hit.get("_score", 0),
+                "_index": hit.get("_index", self.index_name),
+            }
+
+            previews.append(preview)
+
+        return previews
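For orientation, a minimal usage sketch of the new engine follows. The host, index name, and query are illustrative assumptions (they match the constructor defaults shown above but are not required); only the class itself, its parameters, and the run() entry point inherited from BaseSearchEngine come from the code in this diff, and the exact shape of the returned dictionaries may differ.

# Sketch only: hosts, index_name, and the query string are illustrative.
from local_deep_research.web_search_engines.engines.search_engine_elasticsearch import (
    ElasticsearchSearchEngine,
)

engine = ElasticsearchSearchEngine(
    hosts=["http://localhost:9200"],
    index_name="documents",
    max_results=5,
)

# run() is inherited from BaseSearchEngine and drives the two-phase flow:
# _get_previews() -> optional LLM relevance filtering -> _get_full_content().
for result in engine.run("retrieval augmented generation"):
    print(result.get("title"), result.get("link"))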
local_deep_research/web_search_engines/engines/search_engine_google_pse.py

@@ -1,5 +1,4 @@
 import logging
-import os
 import random
 import time
 from typing import Any, Dict, List, Optional

@@ -88,17 +87,26 @@ class GooglePSESearchEngine(BaseSearchEngine):
         # Region/Country setting
         self.region = region

-        # API key and Search Engine ID
-
-
+        # API key and Search Engine ID - check params, env vars, or database
+        from ...utilities.db_utils import get_db_setting
+
+        self.api_key = api_key
+        if not self.api_key:
+            self.api_key = get_db_setting("search.engine.web.google_pse.api_key")
+
+        self.search_engine_id = search_engine_id
+        if not self.search_engine_id:
+            self.search_engine_id = get_db_setting(
+                "search.engine.web.google_pse.engine_id"
+            )

         if not self.api_key:
             raise ValueError(
-                "Google API key is required. Set it in the GOOGLE_PSE_API_KEY environment variable."
+                "Google API key is required. Set it in the UI settings, use the api_key parameter, or set the GOOGLE_PSE_API_KEY environment variable."
             )
         if not self.search_engine_id:
             raise ValueError(
-                "Google Search Engine ID is required. Set it in the GOOGLE_PSE_ENGINE_ID environment variable."
+                "Google Search Engine ID is required. Set it in the UI settings, use the search_engine_id parameter, or set the GOOGLE_PSE_ENGINE_ID environment variable."
             )

         # Validate connection and credentials
local_deep_research/web_search_engines/engines/search_engine_local.py

@@ -1,6 +1,5 @@
 import hashlib
 import json
-import logging
 import os
 import time
 import uuid

@@ -29,16 +28,13 @@ from langchain_community.vectorstores import FAISS
 from langchain_core.documents import Document
 from langchain_core.language_models import BaseLLM
 from langchain_text_splitters import RecursiveCharacterTextSplitter
+from loguru import logger

 from ...config import search_config
 from ...utilities.db_utils import get_db_setting
 from ...utilities.url_utils import normalize_url
 from ..search_engine_base import BaseSearchEngine

-# Setup logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-

 def _get_file_loader(file_path: str) -> Optional[BaseLoader]:
     """Get an appropriate document loader for a file based on its extension"""

@@ -62,8 +58,8 @@ def _get_file_loader(file_path: str) -> Optional[BaseLoader]:
         # Try the text loader as a fallback for unknown extensions
         logger.warning(f"Unknown file extension for {file_path}, trying TextLoader")
         return TextLoader(str(file_path), encoding="utf-8")
-    except Exception
-        logger.
+    except Exception:
+        logger.exception(f"Error creating loader for {file_path}")
         return None

@@ -94,8 +90,8 @@ def _load_document(file_path: Path) -> List[Document]:
             doc.metadata["source"] = str(file_path)
             doc.metadata["filename"] = file_path.name

-    except Exception
-        logger.
+    except Exception:
+        logger.exception(f"Error loading {file_path}")
         return []

     return docs

@@ -197,8 +193,8 @@ class LocalEmbeddingManager:
                 model_name=self.embedding_model,
                 model_kwargs={"device": self.embedding_device},
             )
-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Error initializing embeddings")
            logger.warning(
                "Falling back to HuggingFaceEmbeddings with all-MiniLM-L6-v2"
            )

@@ -226,8 +222,8 @@ class LocalEmbeddingManager:
             logger.info(f"Loaded index with {doc_count} document chunks")

             return vector_store
-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Error loading vector store")
             logger.info("Will create a new vector store")

         # Create a new vector store

@@ -241,8 +237,8 @@ class LocalEmbeddingManager:
         try:
             with open(index_metadata_path, "r") as f:
                 return json.load(f)
-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Error loading index metadata")

         return {}

@@ -253,8 +249,8 @@ class LocalEmbeddingManager:
         try:
             with open(index_metadata_path, "w") as f:
                 json.dump(self.indexed_folders, f, indent=2)
-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Error saving index metadata")

     @staticmethod
     def get_folder_hash(folder_path: Path) -> str:

@@ -397,8 +393,8 @@ class LocalEmbeddingManager:
                     normalize_L2=True,
                 )
                 logger.info(f"Loaded index for {folder_path} from disk")
-            except Exception
-                logger.
+            except Exception:
+                logger.exception(f"Error loading index for {folder_path}")
                 # If loading fails, force reindexing
                 force_reindex = True

@@ -574,8 +570,8 @@ class LocalEmbeddingManager:
                     allow_dangerous_deserialization=True,
                     normalize_L2=True,
                 )
-            except Exception
-                logger.
+            except Exception:
+                logger.exception(f"Error loading index for {folder_path}")
                 continue

             # Search in this folder

@@ -599,8 +595,8 @@ class LocalEmbeddingManager:
                 }

                 all_results.append(result)
-            except Exception
-                logger.
+            except Exception:
+                logger.exception(f"Error searching in {folder_path}")

         # Sort by similarity (highest first)
         all_results.sort(key=lambda x: x["similarity"], reverse=True)
local_deep_research/web_search_engines/engines/search_engine_local_all.py

@@ -2,19 +2,16 @@
 Search engine that searches across all local collections
 """

-import logging
 from typing import Any, Dict, List, Optional, cast

 from langchain_core.language_models import BaseLLM
+from loguru import logger

 from ..search_engine_base import BaseSearchEngine
 from ..search_engine_factory import create_search_engine
 from ..search_engines_config import local_search_engines
 from .search_engine_local import LocalSearchEngine

-# Setup logging
-logger = logging.getLogger(__name__)
-

 class LocalAllSearchEngine(BaseSearchEngine):
     """

@@ -62,9 +59,9 @@ class LocalAllSearchEngine(BaseSearchEngine):
                     "name": engine.name,
                     "description": engine.description,
                 }
-            except Exception
-                logger.
-                    f"Error creating search engine for collection '{collection_id}'
+            except Exception:
+                logger.exception(
+                    f"Error creating search engine for collection '{collection_id}'"
                 )
         except ImportError:
             logger.warning("No local collections configuration found")

@@ -97,8 +94,8 @@ class LocalAllSearchEngine(BaseSearchEngine):
                 preview["collection_description"] = engine_info["description"]

             all_previews.extend(previews)
-        except Exception
-            logger.
+        except Exception:
+            logger.exception(f"Error searching collection '{collection_id}'")

         if not all_previews:
             logger.info(f"No local documents found for query: {query}")

@@ -139,9 +136,9 @@ class LocalAllSearchEngine(BaseSearchEngine):
             try:
                 results = engine._get_full_content(items)
                 all_results.extend(results)
-            except Exception
-                logger.
-                    f"Error getting full content from collection '{collection_id}'
+            except Exception:
+                logger.exception(
+                    f"Error getting full content from collection '{collection_id}'"
                 )
                 # Fall back to returning the items without full content
                 all_results.extend(items)
local_deep_research/web_search_engines/engines/search_engine_searxng.py

@@ -1,20 +1,16 @@
 import enum
-import logging
 import os
 import time
 from typing import Any, Dict, List, Optional

 import requests
 from langchain_core.language_models import BaseLLM
+from loguru import logger

 from ...config import search_config
 from ..search_engine_base import BaseSearchEngine
 from .full_search import FullSearchResults

-# Setup logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-

 @enum.unique
 class SafeSearchSetting(enum.IntEnum):

@@ -70,9 +66,8 @@ class SearXNGSearchEngine(BaseSearchEngine):
             llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
         )

-        self.instance_url = instance_url
         # Validate and normalize the instance URL if provided
-        self.instance_url =
+        self.instance_url = instance_url.rstrip("/")
         logger.info(f"SearXNG initialized with instance URL: {self.instance_url}")
         try:
             # Make sure it's accessible.

@@ -182,8 +177,8 @@ class SearXNGSearchEngine(BaseSearchEngine):
                 self.instance_url, headers=initial_headers, timeout=10
             )
             cookies = initial_response.cookies
-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Failed to get initial cookies")
             cookies = None

         params = {

@@ -311,15 +306,15 @@ class SearXNGSearchEngine(BaseSearchEngine):
                 except ImportError:
                     logger.error("BeautifulSoup not available for HTML parsing")
                     return []
-                except Exception
-                    logger.
+                except Exception:
+                    logger.exception("Error parsing HTML results")
                     return []
             else:
                 logger.error(f"SearXNG returned status code {response.status_code}")
                 return []

-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Error getting SearXNG results")
             return []

     def _get_previews(self, query: str) -> List[Dict[str, Any]]:

@@ -391,8 +386,8 @@ class SearXNGSearchEngine(BaseSearchEngine):
             results_with_content = self.full_search._get_full_content(relevant_items)
             return results_with_content

-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Error retrieving full content")
             return relevant_items

     def invoke(self, query: str) -> List[Dict[str, Any]]:

@@ -511,7 +506,7 @@ https://searxng.github.io/searxng/admin/installation.html
             results = super().run(query)
             logger.info(f"SearXNG search completed with {len(results)} results")
             return results
-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Error in SearXNG run method")
             # Return empty results on error
             return []
local_deep_research/web_search_engines/engines/search_engine_serpapi.py

@@ -1,5 +1,4 @@
 import logging
-import os
 from typing import Any, Dict, List, Optional

 from langchain_community.utilities import SerpAPIWrapper

@@ -64,11 +63,16 @@ class SerpAPISearchEngine(BaseSearchEngine):
             "russian": "ru",
         }

-        # Get API key
-
+        # Get API key - check params, env vars, or database
+        from ...utilities.db_utils import get_db_setting
+
+        serpapi_api_key = api_key
+        if not serpapi_api_key:
+            serpapi_api_key = get_db_setting("search.engine.web.serpapi.api_key")
+
         if not serpapi_api_key:
             raise ValueError(
-                "
+                "SerpAPI key not found. Please provide api_key parameter, set the SERP_API_KEY environment variable, or set it in the UI settings."
             )

         # Get language code