local-deep-research 0.3.12__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff compares the published contents of two package versions as they appear in their public registry. It is provided for informational purposes only.
- local_deep_research/__init__.py +1 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
- local_deep_research/advanced_search_system/findings/repository.py +0 -3
- local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
- local_deep_research/api/benchmark_functions.py +288 -0
- local_deep_research/api/research_functions.py +8 -4
- local_deep_research/benchmarks/README.md +162 -0
- local_deep_research/benchmarks/__init__.py +51 -0
- local_deep_research/benchmarks/benchmark_functions.py +353 -0
- local_deep_research/benchmarks/cli/__init__.py +16 -0
- local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
- local_deep_research/benchmarks/cli.py +347 -0
- local_deep_research/benchmarks/comparison/__init__.py +12 -0
- local_deep_research/benchmarks/comparison/evaluator.py +768 -0
- local_deep_research/benchmarks/datasets/__init__.py +53 -0
- local_deep_research/benchmarks/datasets/base.py +295 -0
- local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
- local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
- local_deep_research/benchmarks/datasets/utils.py +116 -0
- local_deep_research/benchmarks/datasets.py +31 -0
- local_deep_research/benchmarks/efficiency/__init__.py +14 -0
- local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
- local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
- local_deep_research/benchmarks/evaluators/__init__.py +18 -0
- local_deep_research/benchmarks/evaluators/base.py +74 -0
- local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
- local_deep_research/benchmarks/evaluators/composite.py +121 -0
- local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
- local_deep_research/benchmarks/graders.py +410 -0
- local_deep_research/benchmarks/metrics/README.md +80 -0
- local_deep_research/benchmarks/metrics/__init__.py +24 -0
- local_deep_research/benchmarks/metrics/calculation.py +385 -0
- local_deep_research/benchmarks/metrics/reporting.py +155 -0
- local_deep_research/benchmarks/metrics/visualization.py +205 -0
- local_deep_research/benchmarks/metrics.py +11 -0
- local_deep_research/benchmarks/optimization/__init__.py +32 -0
- local_deep_research/benchmarks/optimization/api.py +274 -0
- local_deep_research/benchmarks/optimization/metrics.py +20 -0
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
- local_deep_research/benchmarks/runners.py +434 -0
- local_deep_research/benchmarks/templates.py +65 -0
- local_deep_research/config/llm_config.py +26 -23
- local_deep_research/config/search_config.py +1 -5
- local_deep_research/defaults/default_settings.json +108 -7
- local_deep_research/search_system.py +16 -8
- local_deep_research/utilities/db_utils.py +3 -6
- local_deep_research/utilities/es_utils.py +441 -0
- local_deep_research/utilities/log_utils.py +36 -0
- local_deep_research/utilities/search_utilities.py +8 -9
- local_deep_research/web/app.py +15 -10
- local_deep_research/web/app_factory.py +9 -12
- local_deep_research/web/database/migrations.py +8 -5
- local_deep_research/web/database/models.py +20 -0
- local_deep_research/web/database/schema_upgrade.py +5 -8
- local_deep_research/web/models/database.py +15 -18
- local_deep_research/web/routes/benchmark_routes.py +427 -0
- local_deep_research/web/routes/research_routes.py +13 -17
- local_deep_research/web/routes/settings_routes.py +264 -67
- local_deep_research/web/services/research_service.py +58 -73
- local_deep_research/web/services/settings_manager.py +1 -4
- local_deep_research/web/services/settings_service.py +4 -6
- local_deep_research/web/static/css/styles.css +12 -0
- local_deep_research/web/static/js/components/logpanel.js +164 -155
- local_deep_research/web/static/js/components/research.js +44 -3
- local_deep_research/web/static/js/components/settings.js +27 -0
- local_deep_research/web/static/js/services/socket.js +47 -0
- local_deep_research/web_search_engines/default_search_engines.py +38 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
- local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
- local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
- local_deep_research/web_search_engines/search_engine_base.py +22 -5
- local_deep_research/web_search_engines/search_engine_factory.py +30 -11
- local_deep_research/web_search_engines/search_engines_config.py +14 -1
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/METADATA +10 -2
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/RECORD +93 -51
- local_deep_research/app.py +0 -8
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/WHEEL +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/licenses/LICENSE +0 -0
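The headline change in 0.4.1 is the new `benchmarks` package (datasets, graders, metrics, an Optuna-based optimizer, plus CLI and web routes), alongside a new Elasticsearch engine and a library-wide switch from stdlib `logging` to Loguru. A minimal sketch of driving a benchmark run from Python follows; the entry-point name and parameters are assumptions for illustration, not the package's confirmed API (consult `benchmarks/README.md` in the wheel):

```python
# Hypothetical usage sketch: the entry-point name below is an assumption,
# not verified against local_deep_research 0.4.1's actual benchmark API.
from local_deep_research.api import benchmark_functions  # module added in 0.4.1

run = getattr(benchmark_functions, "evaluate_benchmark", None)  # name assumed
if run is not None:
    metrics = run(dataset="simpleqa", num_examples=25)  # parameters assumed
    print(metrics)
```

Selected hunks from the diff follow.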
**local_deep_research/web/static/js/components/research.js** (+44 −3)

```diff
@@ -718,18 +718,48 @@
                 usingFallbackModels = true;
             }
         } else if (providerUpper === 'OPENAI_ENDPOINT') {
-            // For custom endpoints, show a mix of models as examples
             models = allModels.filter(model => {
                 if (!model || typeof model !== 'object') return false;

                 // Skip provider options
                 if (model.value && !model.id && !model.name) return false;

-                // Include OpenAI and Anthropic models as examples
                 const modelProvider = (model.provider || '').toUpperCase();
-                return modelProvider === '
+                return modelProvider === 'OPENAI_ENDPOINT';
             });

+            console.log(`Found ${models.length} models with provider="OPENAI_ENDPOINT"`);
+
+            if (models.length === 0) {
+                console.log('No OPENAI_ENDPOINT models found, checking for models with "Custom" in label');
+                models = allModels.filter(model => {
+                    if (!model || typeof model !== 'object') return false;
+
+                    // Skip provider options
+                    if (model.value && !model.id && !model.name) return false;
+
+                    const modelLabel = (model.label || '').toLowerCase();
+                    return modelLabel.includes('custom');
+                });
+
+                console.log(`Found ${models.length} models with "Custom" in label`);
+            }
+
+            if (models.length === 0) {
+                console.log('No OPENAI_ENDPOINT or Custom models found, using OpenAI models as examples');
+                models = allModels.filter(model => {
+                    if (!model || typeof model !== 'object') return false;
+
+                    // Skip provider options
+                    if (model.value && !model.id && !model.name) return false;
+
+                    const modelProvider = (model.provider || '').toUpperCase();
+                    const modelId = (model.id || model.value || '').toLowerCase();
+                    return modelProvider === 'OPENAI' ||
+                        modelId.includes('gpt');
+                });
+            }
+
             // Add fallbacks if necessary
             if (models.length === 0) {
                 console.log('No models found for custom endpoint, using fallbacks');
@@ -1411,6 +1441,17 @@
         });
     }

+    // Process Custom OpenAI Endpoint models
+    if (data.providers && data.providers.openai_endpoint_models) {
+        data.providers.openai_endpoint_models.forEach(model => {
+            formatted.push({
+                ...model,
+                id: model.value,
+                provider: 'OPENAI_ENDPOINT'
+            });
+        });
+    }
+
     return formatted;
 }
```
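The research.js hunk above replaces a hard-coded "show OpenAI and Anthropic examples" branch with a three-stage cascade: exact `OPENAI_ENDPOINT` provider match, then a "Custom" label match, then OpenAI/GPT models as examples. A minimal Python sketch of the same cascade, with the model dicts assumed to mirror the JS objects:

```python
from typing import Any

def pick_endpoint_models(all_models: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Mirror of the JS cascade: try progressively looser filters until one matches."""
    def usable(m: dict[str, Any]) -> bool:
        # Skip provider-option placeholder entries, as the JS code does.
        return not (m.get("value") and not m.get("id") and not m.get("name"))

    stages = [
        lambda m: (m.get("provider") or "").upper() == "OPENAI_ENDPOINT",
        lambda m: "custom" in (m.get("label") or "").lower(),
        lambda m: (m.get("provider") or "").upper() == "OPENAI"
        or "gpt" in (m.get("id") or m.get("value") or "").lower(),
    ]
    for stage in stages:
        matches = [m for m in all_models if usable(m) and stage(m)]
        if matches:
            return matches
    return []  # caller falls back to hard-coded defaults
```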
**local_deep_research/web/static/js/components/settings.js** (+27 −0)

```diff
@@ -2761,6 +2761,21 @@
         });
     }

+    // Add Custom OpenAI Endpoint models if available
+    if (data.providers && data.providers.openai_endpoint_models && data.providers.openai_endpoint_models.length > 0) {
+        const openai_endpoint_models = data.providers.openai_endpoint_models;
+        console.log('Found OpenAI Endpoint models:', openai_endpoint_models.length);
+
+        // Add provider information to each model
+        openai_endpoint_models.forEach(model => {
+            formattedModels.push({
+                value: model.value,
+                label: model.label,
+                provider: 'OPENAI_ENDPOINT' // Ensure provider field is added
+            });
+        });
+    }
+
     // Update the global modelOptions array
     modelOptions = formattedModels;
     console.log('Final modelOptions:', modelOptions.length, 'models');
@@ -3673,6 +3688,18 @@

         return false;
     }
+
+    if (providerUpper === 'OPENAI_ENDPOINT') {
+        if (model.provider && model.provider.toUpperCase() === 'OPENAI_ENDPOINT') {
+            return true;
+        }
+
+        if (model.label && model.label.toLowerCase().includes('custom')) {
+            return true;
+        }
+
+        return false;
+    }

     // For other providers, use standard matching
     if (model.provider) {
```
**local_deep_research/web/static/js/services/socket.js** (+47 −0)

```diff
@@ -85,6 +85,29 @@ window.socket = (function() {
         }
     });

+    // Add handler for search engine selection events
+    socket.on('search_engine_selected', (data) => {
+        console.log('Received search_engine_selected event:', data);
+        if (data && data.engine) {
+            const engineName = data.engine;
+            const resultCount = data.result_count || 0;
+
+            // Add to log panel
+            if (typeof window.addConsoleLog === 'function') {
+                // Format engine name - capitalize first letter
+                const displayEngineName = engineName.charAt(0).toUpperCase() + engineName.slice(1);
+                const message = `Search engine selected: ${displayEngineName} (found ${resultCount} results)`;
+                window.addConsoleLog(message, 'info', {
+                    type: 'info',
+                    phase: 'engine_selected',
+                    engine: engineName,
+                    result_count: resultCount,
+                    is_engine_selection: true
+                });
+            }
+        }
+    });
+
     socket.on('disconnect', (reason) => {
         console.log('Socket disconnected:', reason);

@@ -237,6 +260,30 @@ window.socket = (function() {
         });
     }

+    // Handle special engine selection events
+    if (data.event === 'search_engine_selected' || (data.engine && data.result_count !== undefined)) {
+        // Extract engine information
+        const engineName = data.engine || 'unknown';
+        const resultCount = data.result_count || 0;
+
+        // Log the event
+        console.log(`Search engine selected: ${engineName} (found ${resultCount} results)`);
+
+        // Add to log panel as an info message with special metadata
+        if (typeof window.addConsoleLog === 'function') {
+            // Format engine name - capitalize first letter
+            const displayEngineName = engineName.charAt(0).toUpperCase() + engineName.slice(1);
+            const message = `Search engine selected: ${displayEngineName} (found ${resultCount} results)`;
+            window.addConsoleLog(message, 'info', {
+                type: 'info',
+                phase: 'engine_selected',
+                engine: engineName,
+                result_count: resultCount,
+                is_engine_selection: true
+            });
+        }
+    }
+
     // Initialize message tracking if not exists
     window._processedSocketMessages = window._processedSocketMessages || new Map();

```
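Both socket.js handlers above consume a `search_engine_selected` event. The producing side appears later in this diff: `meta_search_engine.py` calls `emit_socket_event` with the engine name and result count, and swallows socket failures so they cannot break the search. Condensed to its essentials (the surrounding method is elided):

```python
from local_deep_research.web.services.socket_service import emit_socket_event
from loguru import logger

def notify_engine_selected(engine_name: str, previews: list) -> None:
    """Tell connected clients which engine produced results; sketch of the
    pattern used in MetaSearchEngine._get_previews in this diff."""
    try:
        emit_socket_event(
            "search_engine_selected",
            {"engine": engine_name, "result_count": len(previews)},
        )
    except Exception:
        # Socket failures must not break the search itself.
        logger.exception("Socket emit error (non-critical)")
```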
**local_deep_research/web_search_engines/default_search_engines.py** (new file, +38)

```diff
@@ -0,0 +1,38 @@
+"""
+Default search engine configurations.
+This file can be used to initialize the search engine configurations.
+"""
+
+def get_default_elasticsearch_config():
+    """
+    Returns the default Elasticsearch search engine configuration.
+
+    Returns:
+        dict: Default configuration for Elasticsearch search engine
+    """
+    return {
+        "module_path": "local_deep_research.web_search_engines.engines.search_engine_elasticsearch",
+        "class_name": "ElasticsearchSearchEngine",
+        "requires_llm": True,
+        "default_params": {
+            "hosts": ["http://172.16.4.131:9200"],
+            "index_name": "sample_documents",
+            "highlight_fields": ["content", "title"],
+            "search_fields": ["content", "title"],
+        },
+        "description": "Search engine for Elasticsearch databases",
+        "strengths": "Efficient for searching document collections and structured data",
+        "weaknesses": "Requires an Elasticsearch instance and properly indexed data",
+        "reliability": "High, depending on your Elasticsearch setup",
+    }
+
+def get_default_search_engine_configs():
+    """
+    Returns a dictionary of default search engine configurations.
+
+    Returns:
+        dict: Dictionary of default search engine configurations
+    """
+    return {
+        "elasticsearch": get_default_elasticsearch_config(),
+    }
```
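The `module_path` and `class_name` pair is the usual recipe for lazy loading; `search_engine_factory.py` presumably resolves entries like this, though the standalone loader below is illustrative rather than the package's actual implementation. Note also that the default `hosts` value points at a private address (`http://172.16.4.131:9200`), so anyone enabling this engine will want to override it:

```python
import importlib
from typing import Any

def load_engine_from_config(config: dict[str, Any], **overrides: Any) -> Any:
    """Resolve a {'module_path', 'class_name', 'default_params'} entry
    into a live search engine instance (illustrative sketch)."""
    module = importlib.import_module(config["module_path"])
    engine_cls = getattr(module, config["class_name"])
    params = {**config.get("default_params", {}), **overrides}
    return engine_cls(**params)

# e.g. load_engine_from_config(get_default_elasticsearch_config(),
#                              hosts=["http://localhost:9200"])
```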
**local_deep_research/web_search_engines/engines/meta_search_engine.py** (+100 −33)

```diff
@@ -1,6 +1,7 @@
-import logging
 from typing import Any, Dict, List, Optional

+from loguru import logger
+
 from ...utilities.db_utils import get_db_setting
 from ...web.services.socket_service import emit_socket_event
 from ..search_engine_base import BaseSearchEngine
@@ -8,10 +9,6 @@ from ..search_engine_factory import create_search_engine
 from ..search_engines_config import search_config
 from .search_engine_wikipedia import WikipediaSearchEngine

-# Setup logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-

 class MetaSearchEngine(BaseSearchEngine):
     """
```
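This hunk is one instance of the release-wide logging migration: the per-module `logging.getLogger(__name__)` boilerplate, and the library-unfriendly `logging.basicConfig` call, give way to Loguru's ready-made `logger`. The before and after in miniature:

```python
# Before: stdlib logging, with per-module setup
import logging
logging.basicConfig(level=logging.INFO)   # problematic inside a library
logger = logging.getLogger(__name__)
logger.error("failed: %s", "reason")

# After: loguru needs no setup, and logger.exception() records the traceback
from loguru import logger
try:
    1 / 0
except Exception:
    logger.exception("Error analyzing query with LLM")
```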
```diff
@@ -120,6 +117,8 @@ class MetaSearchEngine(BaseSearchEngine):
     def analyze_query(self, query: str) -> List[str]:
         """
         Analyze the query to determine the best search engines to use.
+        Prioritizes SearXNG for general queries, but selects specialized engines
+        for domain-specific queries (e.g., scientific papers, code).

         Args:
             query: The search query
@@ -128,10 +127,57 @@ class MetaSearchEngine(BaseSearchEngine):
            List of search engine names sorted by suitability
        """
        try:
-           #
-
+           # First check if this is a specialized query that should use specific engines
+           specialized_domains = {
+               "scientific paper": ["arxiv", "pubmed", "wikipedia"],
+               "medical research": ["pubmed", "searxng"],
+               "clinical": ["pubmed", "searxng"],
+               "github": ["github", "searxng"],
+               "repository": ["github", "searxng"],
+               "code": ["github", "searxng"],
+               "programming": ["github", "searxng"],
+           }
+
+           # Quick heuristic check for specialized queries
+           query_lower = query.lower()
+           for term, engines in specialized_domains.items():
+               if term in query_lower:
+                   valid_engines = []
+                   for engine in engines:
+                       if engine in self.available_engines:
+                           valid_engines.append(engine)
+
+                   if valid_engines:
+                       logger.info(
+                           f"Detected specialized query type: {term}, using engines: {valid_engines}"
+                       )
+                       return valid_engines
+
+           # For searches containing "arxiv", prioritize the arxiv engine
+           if "arxiv" in query_lower and "arxiv" in self.available_engines:
+               return ["arxiv"] + [e for e in self.available_engines if e != "arxiv"]
+
+           # For searches containing "pubmed", prioritize the pubmed engine
+           if "pubmed" in query_lower and "pubmed" in self.available_engines:
+               return ["pubmed"] + [e for e in self.available_engines if e != "pubmed"]
+
+           # Check if SearXNG is available and prioritize it for general queries
+           if "searxng" in self.available_engines:
+               # For general queries, return SearXNG first followed by reliability-ordered engines
+               engines_without_searxng = [
+                   e for e in self.available_engines if e != "searxng"
+               ]
+               reliability_sorted = sorted(
+                   engines_without_searxng,
+                   key=lambda x: search_config().get(x, {}).get("reliability", 0),
+                   reverse=True,
+               )
+               return ["searxng"] + reliability_sorted
+
+           # If LLM is not available or SearXNG is not available, fall back to reliability
+           if not self.llm or "searxng" not in self.available_engines:
                logger.warning(
-                   "No LLM available
+                   "No LLM available or SearXNG not available, using reliability-based engines"
                )
                # Return engines sorted by reliability
                return sorted(
```
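The rewritten `analyze_query` short-circuits the LLM whenever a cheap substring match identifies a specialized domain. Stripped of the class context, the routing heuristic is a first-match table lookup; the table below is copied from the diff:

```python
SPECIALIZED_DOMAINS = {
    "scientific paper": ["arxiv", "pubmed", "wikipedia"],
    "medical research": ["pubmed", "searxng"],
    "clinical": ["pubmed", "searxng"],
    "github": ["github", "searxng"],
    "repository": ["github", "searxng"],
    "code": ["github", "searxng"],
    "programming": ["github", "searxng"],
}

def route_query(query: str, available: set[str]) -> list[str] | None:
    """Return engines for a specialized query, or None to defer to SearXNG/LLM.
    First matching term wins, so dict order matters."""
    query_lower = query.lower()
    for term, engines in SPECIALIZED_DOMAINS.items():
        valid = [e for e in engines if e in available]
        if term in query_lower and valid:
            return valid
    return None

assert route_query("find a scientific paper on LLMs", {"arxiv", "searxng"}) == ["arxiv"]
```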
```diff
@@ -157,8 +203,8 @@ class MetaSearchEngine(BaseSearchEngine):
                    engines_info.append(
                        f"- {engine_name}: {description}\n Strengths: {strengths}\n Weaknesses: {weaknesses}"
                    )
-               except KeyError
-                   logger.
+               except KeyError:
+                   logger.exception(f"Missing key for engine {engine_name}")

        # Only proceed if we have engines available to choose from
        if not engines_info:
@@ -171,6 +217,7 @@ class MetaSearchEngine(BaseSearchEngine):
                reverse=True,
            )

+       # Use a stronger prompt that emphasizes SearXNG preference for general queries
        prompt = f"""You are a search query analyst. Consider this search query:

QUERY: {query}
@@ -179,11 +226,17 @@ I have these search engines available:
{chr(10).join(engines_info)}

Determine which search engines would be most appropriate for answering this query.
-First analyze the nature of the query
-
+First analyze the nature of the query: Is it factual, scientific, code-related, medical, etc.?
+
+IMPORTANT GUIDELINES:
+- Use SearXNG for most general queries as it combines results from multiple search engines
+- For academic/scientific searches, prefer arXiv
+- For medical research, prefer PubMed
+- For code repositories and programming, prefer GitHub
+- For every other query type, SearXNG is usually the best option

-Output ONLY a comma-separated list of
-Example output: wikipedia,
+Output ONLY a comma-separated list of 1-3 search engine names in order of most appropriate to least appropriate.
+Example output: searxng,wikipedia,brave"""

        # Get analysis from LLM
        response = self.llm.invoke(prompt)
```
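Pinning the prompt's output to a short comma-separated list keeps the parsing step trivial. A sketch of the post-processing the diff describes (lower-case, split on commas, keep only known engines, and keep SearXNG in play as a fallback):

```python
def parse_engine_list(response_text: str, available: set[str]) -> list[str]:
    """Turn an LLM reply like 'searxng, wikipedia, brave' into a validated list."""
    valid: list[str] = []
    for name in response_text.strip().lower().split(","):
        cleaned = name.strip().strip("\"'")
        if cleaned in available and cleaned not in valid:
            valid.append(cleaned)
    # Mirror the diff's fallback: keep SearXNG if installed, last when the
    # LLM chose others, first (and only) when nothing validated.
    if "searxng" in available and "searxng" not in valid:
        valid.append("searxng")
    return valid

assert parse_engine_list("searxng, wikipedia, brave", {"searxng", "wikipedia"}) == ["searxng", "wikipedia"]
```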
```diff
@@ -201,7 +254,16 @@ Example output: wikipedia,arxiv,github"""
                if cleaned_name in self.available_engines:
                    valid_engines.append(cleaned_name)

-           # If
+           # If SearXNG is available but not selected by the LLM, add it as a fallback
+           if "searxng" in self.available_engines and "searxng" not in valid_engines:
+               # Add it as the last option if the LLM selected others
+               if valid_engines:
+                   valid_engines.append("searxng")
+               # Use it as the first option if no valid engines were selected
+               else:
+                   valid_engines = ["searxng"]
+
+           # If still no valid engines, use reliability-based ordering
            if not valid_engines:
                valid_engines = sorted(
                    self.available_engines,
@@ -210,14 +272,21 @@ Example output: wikipedia,arxiv,github"""
                )

            return valid_engines
-       except Exception
-           logger.
-           # Fall back to reliability-based ordering
-
-
-
-
-
+       except Exception:
+           logger.exception("Error analyzing query with LLM")
+           # Fall back to SearXNG if available, then reliability-based ordering
+           if "searxng" in self.available_engines:
+               return ["searxng"] + sorted(
+                   [e for e in self.available_engines if e != "searxng"],
+                   key=lambda x: search_config().get(x, {}).get("reliability", 0),
+                   reverse=True,
+               )
+           else:
+               return sorted(
+                   self.available_engines,
+                   key=lambda x: search_config().get(x, {}).get("reliability", 0),
+                   reverse=True,
+               )

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
@@ -277,10 +346,8 @@ Example output: wikipedia,arxiv,github"""
                        "search_engine_selected",
                        {"engine": engine_name, "result_count": len(previews)},
                    )
-               except Exception
-                   logger.error(
-                       f"Socket emit error (non-critical): {str(socket_error)}"
-                   )
+               except Exception:
+                   logger.exception("Socket emit error (non-critical)")

                return previews

@@ -289,7 +356,7 @@ Example output: wikipedia,arxiv,github"""

            except Exception as e:
                error_msg = f"Error getting previews from {engine_name}: {str(e)}"
-               logger.
+               logger.exception(error_msg)
                all_errors.append(error_msg)

        # If we reach here, all engines failed, use fallback
@@ -325,9 +392,9 @@ Example output: wikipedia,arxiv,github"""
        try:
            logger.info(f"Using {self._selected_engine_name} to get full content")
            return self._selected_engine._get_full_content(relevant_items)
-       except Exception
-           logger.
-               f"Error getting full content from {self._selected_engine_name}
+       except Exception:
+           logger.exception(
+               f"Error getting full content from {self._selected_engine_name}"
            )
            # Fall back to returning relevant items without full content
            return relevant_items
@@ -354,8 +421,8 @@ Example output: wikipedia,arxiv,github"""
                common_params["max_filtered_results"] = self.max_filtered_results

            engine = create_search_engine(engine_name, **common_params)
-       except Exception
-           logger.
+       except Exception:
+           logger.exception(f"Error creating engine instance for {engine_name}")
            return None

        if engine:
```
**local_deep_research/web_search_engines/engines/search_engine_arxiv.py** (+31 −17)

```diff
@@ -1,14 +1,15 @@
-import logging
 from typing import Any, Dict, List, Optional

 import arxiv
 from langchain_core.language_models import BaseLLM
+from loguru import logger

+from ...advanced_search_system.filters.journal_reputation_filter import (
+    JournalReputationFilter,
+)
 from ...config import search_config
 from ..search_engine_base import BaseSearchEngine

-logger = logging.getLogger(__name__)
-

 class ArXivSearchEngine(BaseSearchEngine):
     """arXiv search engine implementation with two-phase approach"""
@@ -37,9 +38,22 @@ class ArXivSearchEngine(BaseSearchEngine):
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
        """
+       # Initialize the journal reputation filter if needed.
+       content_filters = []
+       journal_filter = JournalReputationFilter.create_default(
+           model=llm, engine_name="arxiv"
+       )
+       if journal_filter is not None:
+           content_filters.append(journal_filter)
+
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
-           llm=llm,
+           llm=llm,
+           max_filtered_results=max_filtered_results,
+           max_results=max_results,
+           # We deliberately do this filtering after relevancy checks,
+           # because it is potentially quite slow.
+           content_filters=content_filters,
        )
        self.max_results = max(self.max_results, 25)
        self.sort_by = sort_by
```
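The arXiv engine now passes a `content_filters` list to `BaseSearchEngine`, and the comment in the hunk explains why the new `JournalReputationFilter` runs after relevance filtering: it is potentially slow. A toy filter compatible with that pattern might look like the sketch below; the `filter_results` method name and signature are assumptions, not verified against `filters/base_filter.py`:

```python
# Sketch only: the method name `filter_results` and its signature are assumed
# from the filter modules touched in this diff, not verified against the source.
from typing import Any

class MinimumSnippetLengthFilter:
    """Toy content filter: drop results whose snippet is too short to grade."""

    def __init__(self, min_chars: int = 80):
        self.min_chars = min_chars

    def filter_results(self, results: list[dict[str, Any]], query: str) -> list[dict[str, Any]]:
        return [r for r in results if len(r.get("snippet", "")) >= self.min_chars]
```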
```diff
@@ -133,14 +147,15 @@ class ArXivSearchEngine(BaseSearchEngine):
                        if paper.published
                        else None
                    ),
+                   "journal_ref": paper.journal_ref,
                }

                previews.append(preview)

            return previews

-       except Exception
-           logger.
+       except Exception:
+           logger.exception("Error getting arXiv previews")
            return []

    def _get_full_content(
@@ -203,7 +218,6 @@ class ArXivSearchEngine(BaseSearchEngine):
                    "categories": paper.categories,
                    "summary": paper.summary,  # Full summary
                    "comment": paper.comment,
-                   "journal_ref": paper.journal_ref,
                    "doi": paper.doi,
                }
            )
@@ -263,17 +277,17 @@ class ArXivSearchEngine(BaseSearchEngine):
                                "Successfully extracted text from PDF using pdfplumber"
                            )
                        except (ImportError, Exception) as e2:
-                           logger.
+                           logger.exception(
                                f"PDF text extraction failed: {str(e1)}, then {str(e2)}"
                            )
                            logger.error(
                                "Using paper summary as content instead"
                            )
-                   except Exception
-                       logger.
+                   except Exception:
+                       logger.exception("Error extracting text from PDF")
                        logger.error("Using paper summary as content instead")
-               except Exception
-                   logger.
+               except Exception:
+                   logger.exception(f"Error downloading paper {paper.title}")
                    result["pdf_path"] = None
                    pdf_count -= 1  # Decrement counter if download fails
                elif (
```
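The PDF handling keeps its layered fallback: try one extractor, fall back to pdfplumber, and finally settle for the paper's summary. The shape of that pattern in isolation (the primary extractor here is an illustrative choice; the diff only names pdfplumber explicitly):

```python
from loguru import logger

def extract_pdf_text(pdf_path: str, fallback_summary: str) -> str:
    """Try pypdf first, then pdfplumber, then give up and use the summary."""
    try:
        from pypdf import PdfReader  # primary extractor (illustrative choice)
        return "\n".join(page.extract_text() or "" for page in PdfReader(pdf_path).pages)
    except Exception as e1:
        try:
            import pdfplumber  # the fallback the diff names explicitly
            with pdfplumber.open(pdf_path) as pdf:
                return "\n".join(page.extract_text() or "" for page in pdf.pages)
        except Exception as e2:
            logger.exception(f"PDF text extraction failed: {e1}, then {e2}")
            return fallback_summary
```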
```diff
@@ -349,6 +363,7 @@ class ArXivSearchEngine(BaseSearchEngine):
                "authors": [
                    author.name for author in paper.authors[:3]
                ],  # First 3 authors
+               "journal_ref": paper.journal_ref,
            }

            # Add full content if not in snippet-only mode
@@ -375,7 +390,6 @@ class ArXivSearchEngine(BaseSearchEngine):
                "categories": paper.categories,
                "summary": paper.summary,  # Full summary
                "comment": paper.comment,
-               "journal_ref": paper.journal_ref,
                "doi": paper.doi,
                "content": paper.summary,  # Use summary as content
                "full_content": paper.summary,  # For consistency
@@ -388,13 +402,13 @@ class ArXivSearchEngine(BaseSearchEngine):
                # Download the paper
                paper_path = paper.download_pdf(dirpath=self.download_dir)
                result["pdf_path"] = str(paper_path)
-           except Exception
-               logger.
+           except Exception:
+               logger.exception("Error downloading paper")

            return result

-       except Exception
-           logger.
+       except Exception:
+           logger.exception("Error getting paper details")
            return {}

    def search_by_author(
```
**local_deep_research/web_search_engines/engines/search_engine_brave.py** (+8 −3)

```diff
@@ -64,11 +64,16 @@ class BraveSearchEngine(BaseSearchEngine):
            "russian": "ru",
        }

-       # Get API key
-
+       # Get API key - check params, env vars, or database
+       from ...utilities.db_utils import get_db_setting
+
+       brave_api_key = api_key
+       if not brave_api_key:
+           brave_api_key = get_db_setting("search.engine.web.brave.api_key")
+
        if not brave_api_key:
            raise ValueError(
-               "
+               "Brave API key not found. Please provide api_key parameter, set the BRAVE_API_KEY environment variable, or set it in the UI settings."
            )

        # Get language code
```
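Per the new error message, the Brave key now resolves through three sources: the `api_key` parameter, the `BRAVE_API_KEY` environment variable, and the `search.engine.web.brave.api_key` database setting. Only the parameter and database steps are visible in this hunk, so the environment lookup below is inferred from the message rather than shown code:

```python
import os
from local_deep_research.utilities.db_utils import get_db_setting

def resolve_brave_api_key(api_key: str | None = None) -> str:
    """Param -> environment -> database, matching the documented order."""
    key = api_key or os.environ.get("BRAVE_API_KEY") or get_db_setting(
        "search.engine.web.brave.api_key"
    )
    if not key:
        raise ValueError(
            "Brave API key not found. Please provide api_key parameter, "
            "set the BRAVE_API_KEY environment variable, or set it in the UI settings."
        )
    return key
```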