local-deep-research 0.3.12__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the changes between two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
Files changed (92)
  1. local_deep_research/__version__.py +1 -1
  2. local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
  3. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
  4. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
  5. local_deep_research/advanced_search_system/findings/repository.py +0 -3
  6. local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
  7. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
  8. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
  9. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
  10. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
  11. local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
  12. local_deep_research/api/benchmark_functions.py +288 -0
  13. local_deep_research/api/research_functions.py +8 -4
  14. local_deep_research/benchmarks/README.md +162 -0
  15. local_deep_research/benchmarks/__init__.py +51 -0
  16. local_deep_research/benchmarks/benchmark_functions.py +353 -0
  17. local_deep_research/benchmarks/cli/__init__.py +16 -0
  18. local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
  19. local_deep_research/benchmarks/cli.py +347 -0
  20. local_deep_research/benchmarks/comparison/__init__.py +12 -0
  21. local_deep_research/benchmarks/comparison/evaluator.py +768 -0
  22. local_deep_research/benchmarks/datasets/__init__.py +53 -0
  23. local_deep_research/benchmarks/datasets/base.py +295 -0
  24. local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
  25. local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
  26. local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
  27. local_deep_research/benchmarks/datasets/utils.py +116 -0
  28. local_deep_research/benchmarks/datasets.py +31 -0
  29. local_deep_research/benchmarks/efficiency/__init__.py +14 -0
  30. local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
  31. local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
  32. local_deep_research/benchmarks/evaluators/__init__.py +18 -0
  33. local_deep_research/benchmarks/evaluators/base.py +74 -0
  34. local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
  35. local_deep_research/benchmarks/evaluators/composite.py +121 -0
  36. local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
  37. local_deep_research/benchmarks/graders.py +410 -0
  38. local_deep_research/benchmarks/metrics/README.md +80 -0
  39. local_deep_research/benchmarks/metrics/__init__.py +24 -0
  40. local_deep_research/benchmarks/metrics/calculation.py +385 -0
  41. local_deep_research/benchmarks/metrics/reporting.py +155 -0
  42. local_deep_research/benchmarks/metrics/visualization.py +205 -0
  43. local_deep_research/benchmarks/metrics.py +11 -0
  44. local_deep_research/benchmarks/optimization/__init__.py +32 -0
  45. local_deep_research/benchmarks/optimization/api.py +274 -0
  46. local_deep_research/benchmarks/optimization/metrics.py +20 -0
  47. local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
  48. local_deep_research/benchmarks/runners.py +434 -0
  49. local_deep_research/benchmarks/templates.py +65 -0
  50. local_deep_research/config/llm_config.py +26 -23
  51. local_deep_research/config/search_config.py +1 -5
  52. local_deep_research/defaults/default_settings.json +108 -7
  53. local_deep_research/search_system.py +16 -8
  54. local_deep_research/utilities/db_utils.py +3 -6
  55. local_deep_research/utilities/es_utils.py +441 -0
  56. local_deep_research/utilities/log_utils.py +36 -0
  57. local_deep_research/utilities/search_utilities.py +8 -9
  58. local_deep_research/web/app.py +7 -9
  59. local_deep_research/web/app_factory.py +9 -12
  60. local_deep_research/web/database/migrations.py +8 -5
  61. local_deep_research/web/database/models.py +20 -0
  62. local_deep_research/web/database/schema_upgrade.py +5 -8
  63. local_deep_research/web/models/database.py +15 -18
  64. local_deep_research/web/routes/benchmark_routes.py +427 -0
  65. local_deep_research/web/routes/research_routes.py +13 -17
  66. local_deep_research/web/routes/settings_routes.py +264 -67
  67. local_deep_research/web/services/research_service.py +47 -57
  68. local_deep_research/web/services/settings_manager.py +1 -4
  69. local_deep_research/web/services/settings_service.py +4 -6
  70. local_deep_research/web/static/css/styles.css +12 -0
  71. local_deep_research/web/static/js/components/logpanel.js +164 -155
  72. local_deep_research/web/static/js/components/research.js +44 -3
  73. local_deep_research/web/static/js/components/settings.js +27 -0
  74. local_deep_research/web/static/js/services/socket.js +47 -0
  75. local_deep_research/web_search_engines/default_search_engines.py +38 -0
  76. local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
  77. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
  78. local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
  79. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
  80. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
  81. local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
  82. local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
  83. local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
  84. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
  85. local_deep_research/web_search_engines/search_engine_base.py +22 -5
  86. local_deep_research/web_search_engines/search_engine_factory.py +32 -11
  87. local_deep_research/web_search_engines/search_engines_config.py +14 -1
  88. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/METADATA +10 -2
  89. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/RECORD +92 -49
  90. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/WHEEL +0 -0
  91. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/entry_points.txt +0 -0
  92. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -718,18 +718,48 @@
             usingFallbackModels = true;
         }
     } else if (providerUpper === 'OPENAI_ENDPOINT') {
-        // For custom endpoints, show a mix of models as examples
         models = allModels.filter(model => {
             if (!model || typeof model !== 'object') return false;

             // Skip provider options
             if (model.value && !model.id && !model.name) return false;

-            // Include OpenAI and Anthropic models as examples
             const modelProvider = (model.provider || '').toUpperCase();
-            return modelProvider === 'OPENAI' || modelProvider === 'ANTHROPIC';
+            return modelProvider === 'OPENAI_ENDPOINT';
         });

+        console.log(`Found ${models.length} models with provider="OPENAI_ENDPOINT"`);
+
+        if (models.length === 0) {
+            console.log('No OPENAI_ENDPOINT models found, checking for models with "Custom" in label');
+            models = allModels.filter(model => {
+                if (!model || typeof model !== 'object') return false;
+
+                // Skip provider options
+                if (model.value && !model.id && !model.name) return false;
+
+                const modelLabel = (model.label || '').toLowerCase();
+                return modelLabel.includes('custom');
+            });
+
+            console.log(`Found ${models.length} models with "Custom" in label`);
+        }
+
+        if (models.length === 0) {
+            console.log('No OPENAI_ENDPOINT or Custom models found, using OpenAI models as examples');
+            models = allModels.filter(model => {
+                if (!model || typeof model !== 'object') return false;
+
+                // Skip provider options
+                if (model.value && !model.id && !model.name) return false;
+
+                const modelProvider = (model.provider || '').toUpperCase();
+                const modelId = (model.id || model.value || '').toLowerCase();
+                return modelProvider === 'OPENAI' ||
+                       modelId.includes('gpt');
+            });
+        }
+
         // Add fallbacks if necessary
         if (models.length === 0) {
             console.log('No models found for custom endpoint, using fallbacks');
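
The hunk above narrows the custom-endpoint dropdown from "any OpenAI/Anthropic model" to a three-tier cascade: an exact OPENAI_ENDPOINT provider match, then a "Custom" label heuristic, then generic OpenAI/gpt models as examples. A minimal Python sketch of the same first-non-empty cascade (function and data names are illustrative, not from the package):

    from typing import Callable, Dict, Iterable, List

    def first_nonempty_filter(
        items: Iterable[Dict], predicates: List[Callable[[Dict], bool]]
    ) -> List[Dict]:
        """Apply predicates in priority order; return the first non-empty match set."""
        pool = list(items)
        for predicate in predicates:
            matched = [item for item in pool if predicate(item)]
            if matched:
                return matched
        return []  # caller then falls back to hard-coded defaults

    all_models = [
        {"value": "gpt-4o", "label": "GPT-4o", "provider": "OPENAI"},
        {"value": "local-llm", "label": "Custom: local-llm", "provider": "OPENAI_ENDPOINT"},
    ]
    models = first_nonempty_filter(
        all_models,
        [
            lambda m: (m.get("provider") or "").upper() == "OPENAI_ENDPOINT",
            lambda m: "custom" in (m.get("label") or "").lower(),
            lambda m: (m.get("provider") or "").upper() == "OPENAI"
            or "gpt" in (m.get("value") or "").lower(),
        ],
    )
    print(models)  # -> the OPENAI_ENDPOINT entry; tiers 2 and 3 never run here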
@@ -1411,6 +1441,17 @@
         });
     }

+    // Process Custom OpenAI Endpoint models
+    if (data.providers && data.providers.openai_endpoint_models) {
+        data.providers.openai_endpoint_models.forEach(model => {
+            formatted.push({
+                ...model,
+                id: model.value,
+                provider: 'OPENAI_ENDPOINT'
+            });
+        });
+    }
+
     return formatted;
 }

@@ -2761,6 +2761,21 @@
         });
     }

+    // Add Custom OpenAI Endpoint models if available
+    if (data.providers && data.providers.openai_endpoint_models && data.providers.openai_endpoint_models.length > 0) {
+        const openai_endpoint_models = data.providers.openai_endpoint_models;
+        console.log('Found OpenAI Endpoint models:', openai_endpoint_models.length);
+
+        // Add provider information to each model
+        openai_endpoint_models.forEach(model => {
+            formattedModels.push({
+                value: model.value,
+                label: model.label,
+                provider: 'OPENAI_ENDPOINT' // Ensure provider field is added
+            });
+        });
+    }
+
     // Update the global modelOptions array
     modelOptions = formattedModels;
     console.log('Final modelOptions:', modelOptions.length, 'models');

@@ -3673,6 +3688,18 @@

         return false;
     }
+
+    if (providerUpper === 'OPENAI_ENDPOINT') {
+        if (model.provider && model.provider.toUpperCase() === 'OPENAI_ENDPOINT') {
+            return true;
+        }
+
+        if (model.label && model.label.toLowerCase().includes('custom')) {
+            return true;
+        }
+
+        return false;
+    }

     // For other providers, use standard matching
     if (model.provider) {

@@ -85,6 +85,29 @@ window.socket = (function() {
         }
     });

+    // Add handler for search engine selection events
+    socket.on('search_engine_selected', (data) => {
+        console.log('Received search_engine_selected event:', data);
+        if (data && data.engine) {
+            const engineName = data.engine;
+            const resultCount = data.result_count || 0;
+
+            // Add to log panel
+            if (typeof window.addConsoleLog === 'function') {
+                // Format engine name - capitalize first letter
+                const displayEngineName = engineName.charAt(0).toUpperCase() + engineName.slice(1);
+                const message = `Search engine selected: ${displayEngineName} (found ${resultCount} results)`;
+                window.addConsoleLog(message, 'info', {
+                    type: 'info',
+                    phase: 'engine_selected',
+                    engine: engineName,
+                    result_count: resultCount,
+                    is_engine_selection: true
+                });
+            }
+        }
+    });
+
     socket.on('disconnect', (reason) => {
         console.log('Socket disconnected:', reason);

@@ -237,6 +260,30 @@ window.socket = (function() {
         });
     }

+    // Handle special engine selection events
+    if (data.event === 'search_engine_selected' || (data.engine && data.result_count !== undefined)) {
+        // Extract engine information
+        const engineName = data.engine || 'unknown';
+        const resultCount = data.result_count || 0;
+
+        // Log the event
+        console.log(`Search engine selected: ${engineName} (found ${resultCount} results)`);
+
+        // Add to log panel as an info message with special metadata
+        if (typeof window.addConsoleLog === 'function') {
+            // Format engine name - capitalize first letter
+            const displayEngineName = engineName.charAt(0).toUpperCase() + engineName.slice(1);
+            const message = `Search engine selected: ${displayEngineName} (found ${resultCount} results)`;
+            window.addConsoleLog(message, 'info', {
+                type: 'info',
+                phase: 'engine_selected',
+                engine: engineName,
+                result_count: resultCount,
+                is_engine_selection: true
+            });
+        }
+    }
+
     // Initialize message tracking if not exists
     window._processedSocketMessages = window._processedSocketMessages || new Map();

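Both handlers above consume the `search_engine_selected` event emitted from Python when the meta search engine settles on an engine; the meta_search_engine.py hunks further down call emit_socket_event with exactly this payload. A sketch of the emitting side (the wrapper function name is hypothetical; emit_socket_event and the payload shape come from those hunks):

    from loguru import logger

    from local_deep_research.web.services.socket_service import emit_socket_event

    def notify_engine_selected(engine_name: str, result_count: int) -> None:
        """Tell connected browsers which engine was chosen and how many previews it returned."""
        try:
            emit_socket_event(
                "search_engine_selected",
                {"engine": engine_name, "result_count": result_count},
            )
        except Exception:
            # Mirrors the package's handling: socket failures must never break the search.
            logger.exception("Socket emit error (non-critical)")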
@@ -0,0 +1,38 @@
+"""
+Default search engine configurations.
+This file can be used to initialize the search engine configurations.
+"""
+
+def get_default_elasticsearch_config():
+    """
+    Returns the default Elasticsearch search engine configuration.
+
+    Returns:
+        dict: Default configuration for Elasticsearch search engine
+    """
+    return {
+        "module_path": "local_deep_research.web_search_engines.engines.search_engine_elasticsearch",
+        "class_name": "ElasticsearchSearchEngine",
+        "requires_llm": True,
+        "default_params": {
+            "hosts": ["http://172.16.4.131:9200"],
+            "index_name": "sample_documents",
+            "highlight_fields": ["content", "title"],
+            "search_fields": ["content", "title"],
+        },
+        "description": "Search engine for Elasticsearch databases",
+        "strengths": "Efficient for searching document collections and structured data",
+        "weaknesses": "Requires an Elasticsearch instance and properly indexed data",
+        "reliability": "High, depending on your Elasticsearch setup",
+    }
+
+def get_default_search_engine_configs():
+    """
+    Returns a dictionary of default search engine configurations.
+
+    Returns:
+        dict: Dictionary of default search engine configurations
+    """
+    return {
+        "elasticsearch": get_default_elasticsearch_config(),
+    }
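
Each config entry carries module_path and class_name, so engines can be instantiated reflectively. The real wiring lives in search_engine_factory.py (+32 -11 in the file list above); the sketch below only illustrates the pattern, and the llm keyword is an assumption:

    import importlib

    def load_engine_from_config(config: dict, llm=None):
        """Instantiate a search engine class from a default-config entry."""
        module = importlib.import_module(config["module_path"])
        engine_cls = getattr(module, config["class_name"])
        params = dict(config.get("default_params", {}))
        if config.get("requires_llm"):
            params["llm"] = llm  # assumed keyword; the real factory may differ
        return engine_cls(**params)

    # e.g. engine = load_engine_from_config(get_default_elasticsearch_config(), llm=my_llm)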
@@ -1,6 +1,7 @@
-import logging
 from typing import Any, Dict, List, Optional

+from loguru import logger
+
 from ...utilities.db_utils import get_db_setting
 from ...web.services.socket_service import emit_socket_event
 from ..search_engine_base import BaseSearchEngine

@@ -8,10 +9,6 @@ from ..search_engine_factory import create_search_engine
 from ..search_engines_config import search_config
 from .search_engine_wikipedia import WikipediaSearchEngine

-# Setup logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-

 class MetaSearchEngine(BaseSearchEngine):
     """
@@ -120,6 +117,8 @@ class MetaSearchEngine(BaseSearchEngine):
     def analyze_query(self, query: str) -> List[str]:
         """
         Analyze the query to determine the best search engines to use.
+        Prioritizes SearXNG for general queries, but selects specialized engines
+        for domain-specific queries (e.g., scientific papers, code).

         Args:
             query: The search query

@@ -128,10 +127,57 @@
             List of search engine names sorted by suitability
         """
         try:
-            # Check if the LLM is available to help select engines
-            if not self.llm:
+            # First check if this is a specialized query that should use specific engines
+            specialized_domains = {
+                "scientific paper": ["arxiv", "pubmed", "wikipedia"],
+                "medical research": ["pubmed", "searxng"],
+                "clinical": ["pubmed", "searxng"],
+                "github": ["github", "searxng"],
+                "repository": ["github", "searxng"],
+                "code": ["github", "searxng"],
+                "programming": ["github", "searxng"],
+            }
+
+            # Quick heuristic check for specialized queries
+            query_lower = query.lower()
+            for term, engines in specialized_domains.items():
+                if term in query_lower:
+                    valid_engines = []
+                    for engine in engines:
+                        if engine in self.available_engines:
+                            valid_engines.append(engine)
+
+                    if valid_engines:
+                        logger.info(
+                            f"Detected specialized query type: {term}, using engines: {valid_engines}"
+                        )
+                        return valid_engines
+
+            # For searches containing "arxiv", prioritize the arxiv engine
+            if "arxiv" in query_lower and "arxiv" in self.available_engines:
+                return ["arxiv"] + [e for e in self.available_engines if e != "arxiv"]
+
+            # For searches containing "pubmed", prioritize the pubmed engine
+            if "pubmed" in query_lower and "pubmed" in self.available_engines:
+                return ["pubmed"] + [e for e in self.available_engines if e != "pubmed"]
+
+            # Check if SearXNG is available and prioritize it for general queries
+            if "searxng" in self.available_engines:
+                # For general queries, return SearXNG first followed by reliability-ordered engines
+                engines_without_searxng = [
+                    e for e in self.available_engines if e != "searxng"
+                ]
+                reliability_sorted = sorted(
+                    engines_without_searxng,
+                    key=lambda x: search_config().get(x, {}).get("reliability", 0),
+                    reverse=True,
+                )
+                return ["searxng"] + reliability_sorted
+
+            # If LLM is not available or SearXNG is not available, fall back to reliability
+            if not self.llm or "searxng" not in self.available_engines:
                 logger.warning(
-                    "No LLM available for query analysis, using default engines"
+                    "No LLM available or SearXNG not available, using reliability-based engines"
                 )
                 # Return engines sorted by reliability
                 return sorted(
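
To see the new selection order in isolation, here is a trimmed, standalone rerun of the heuristic introduced above (the engine set is assumed for the demo; the real method additionally applies reliability sorting and, beyond that, LLM analysis):

    specialized_domains = {
        "scientific paper": ["arxiv", "pubmed", "wikipedia"],
        "medical research": ["pubmed", "searxng"],
        "code": ["github", "searxng"],
    }
    available_engines = ["searxng", "arxiv", "github", "wikipedia"]

    def pick_engines(query: str):
        query_lower = query.lower()
        for term, engines in specialized_domains.items():
            if term in query_lower:
                valid = [e for e in engines if e in available_engines]
                if valid:
                    return valid  # a specialized hit wins outright
        if "searxng" in available_engines:
            return ["searxng"] + [e for e in available_engines if e != "searxng"]
        return available_engines

    print(pick_engines("new scientific paper on fusion"))  # ['arxiv', 'wikipedia']
    print(pick_engines("best pasta recipes"))              # ['searxng', 'arxiv', 'github', 'wikipedia']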
@@ -157,8 +203,8 @@
                 engines_info.append(
                     f"- {engine_name}: {description}\n Strengths: {strengths}\n Weaknesses: {weaknesses}"
                 )
-            except KeyError as e:
-                logger.error(f"Missing key for engine {engine_name}: {str(e)}")
+            except KeyError:
+                logger.exception(f"Missing key for engine {engine_name}")

         # Only proceed if we have engines available to choose from
         if not engines_info:

@@ -171,6 +217,7 @@
                 reverse=True,
             )

+        # Use a stronger prompt that emphasizes SearXNG preference for general queries
         prompt = f"""You are a search query analyst. Consider this search query:

QUERY: {query}

@@ -179,11 +226,17 @@ I have these search engines available:
 {chr(10).join(engines_info)}

 Determine which search engines would be most appropriate for answering this query.
-First analyze the nature of the query (factual, scientific, code-related, etc.)
-Then select the 1-3 most appropriate search engines for this type of query.
+First analyze the nature of the query: Is it factual, scientific, code-related, medical, etc.?
+
+IMPORTANT GUIDELINES:
+- Use SearXNG for most general queries as it combines results from multiple search engines
+- For academic/scientific searches, prefer arXiv
+- For medical research, prefer PubMed
+- For code repositories and programming, prefer GitHub
+- For every other query type, SearXNG is usually the best option

-Output ONLY a comma-separated list of the search engine names in order of most appropriate to least appropriate.
-Example output: wikipedia,arxiv,github"""
+Output ONLY a comma-separated list of 1-3 search engine names in order of most appropriate to least appropriate.
+Example output: searxng,wikipedia,brave"""

         # Get analysis from LLM
         response = self.llm.invoke(prompt)

@@ -201,7 +254,16 @@
                 if cleaned_name in self.available_engines:
                     valid_engines.append(cleaned_name)

-            # If no valid engines were returned, use default order based on reliability
+            # If SearXNG is available but not selected by the LLM, add it as a fallback
+            if "searxng" in self.available_engines and "searxng" not in valid_engines:
+                # Add it as the last option if the LLM selected others
+                if valid_engines:
+                    valid_engines.append("searxng")
+                # Use it as the first option if no valid engines were selected
+                else:
+                    valid_engines = ["searxng"]
+
+            # If still no valid engines, use reliability-based ordering
             if not valid_engines:
                 valid_engines = sorted(
                     self.available_engines,

@@ -210,14 +272,21 @@ Example output: wikipedia,arxiv,github"""
                 )

             return valid_engines
-        except Exception as e:
-            logger.error(f"Error analyzing query with LLM: {str(e)}")
-            # Fall back to reliability-based ordering
-            return sorted(
-                self.available_engines,
-                key=lambda x: search_config().get(x, {}).get("reliability", 0),
-                reverse=True,
-            )
+        except Exception:
+            logger.exception("Error analyzing query with LLM")
+            # Fall back to SearXNG if available, then reliability-based ordering
+            if "searxng" in self.available_engines:
+                return ["searxng"] + sorted(
+                    [e for e in self.available_engines if e != "searxng"],
+                    key=lambda x: search_config().get(x, {}).get("reliability", 0),
+                    reverse=True,
+                )
+            else:
+                return sorted(
+                    self.available_engines,
+                    key=lambda x: search_config().get(x, {}).get("reliability", 0),
+                    reverse=True,
+                )

     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """

@@ -277,10 +346,8 @@
                         "search_engine_selected",
                         {"engine": engine_name, "result_count": len(previews)},
                     )
-                except Exception as socket_error:
-                    logger.error(
-                        f"Socket emit error (non-critical): {str(socket_error)}"
-                    )
+                except Exception:
+                    logger.exception("Socket emit error (non-critical)")

                 return previews

@@ -289,7 +356,7 @@

             except Exception as e:
                 error_msg = f"Error getting previews from {engine_name}: {str(e)}"
-                logger.error(error_msg)
+                logger.exception(error_msg)
                 all_errors.append(error_msg)

         # If we reach here, all engines failed, use fallback

@@ -325,9 +392,9 @@
             try:
                 logger.info(f"Using {self._selected_engine_name} to get full content")
                 return self._selected_engine._get_full_content(relevant_items)
-            except Exception as e:
-                logger.error(
-                    f"Error getting full content from {self._selected_engine_name}: {str(e)}"
+            except Exception:
+                logger.exception(
+                    f"Error getting full content from {self._selected_engine_name}"
                 )
                 # Fall back to returning relevant items without full content
                 return relevant_items

@@ -354,8 +421,8 @@
                 common_params["max_filtered_results"] = self.max_filtered_results

             engine = create_search_engine(engine_name, **common_params)
-        except Exception as e:
-            logger.error(f"Error creating engine instance for {engine_name}: {str(e)}")
+        except Exception:
+            logger.exception(f"Error creating engine instance for {engine_name}")
             return None

         if engine:

@@ -1,14 +1,15 @@
-import logging
 from typing import Any, Dict, List, Optional

 import arxiv
 from langchain_core.language_models import BaseLLM
+from loguru import logger

+from ...advanced_search_system.filters.journal_reputation_filter import (
+    JournalReputationFilter,
+)
 from ...config import search_config
 from ..search_engine_base import BaseSearchEngine

-logger = logging.getLogger(__name__)
-

 class ArXivSearchEngine(BaseSearchEngine):
     """arXiv search engine implementation with two-phase approach"""

@@ -37,9 +38,22 @@ class ArXivSearchEngine(BaseSearchEngine):
             llm: Language model for relevance filtering
             max_filtered_results: Maximum number of results to keep after filtering
         """
+        # Initialize the journal reputation filter if needed.
+        content_filters = []
+        journal_filter = JournalReputationFilter.create_default(
+            model=llm, engine_name="arxiv"
+        )
+        if journal_filter is not None:
+            content_filters.append(journal_filter)
+
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
         super().__init__(
-            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
+            # We deliberately do this filtering after relevancy checks,
+            # because it is potentially quite slow.
+            content_filters=content_filters,
         )
         self.max_results = max(self.max_results, 25)
         self.sort_by = sort_by
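
The new content_filters hook lets BaseSearchEngine run filters such as JournalReputationFilter after relevance filtering (search_engine_base.py gains +22 -5 for this). The interface below is a sketch inferred from the existence of filters/base_filter.py; the method name and signature are assumptions:

    from abc import ABC, abstractmethod
    from typing import Any, Dict, List

    class BaseFilter(ABC):
        """Assumed shape of the filter contract in filters/base_filter.py."""

        @abstractmethod
        def filter_results(
            self, results: List[Dict[str, Any]], query: str
        ) -> List[Dict[str, Any]]:
            """Return the subset of results that pass the filter."""

    class RequireJournalRefFilter(BaseFilter):
        """Toy stand-in: keep only previews that carry a journal_ref field."""

        def filter_results(self, results, query):
            return [r for r in results if r.get("journal_ref")]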
@@ -133,14 +147,15 @@
                         if paper.published
                         else None
                     ),
+                    "journal_ref": paper.journal_ref,
                 }

                 previews.append(preview)

             return previews

-        except Exception as e:
-            logger.error(f"Error getting arXiv previews: {e}")
+        except Exception:
+            logger.exception("Error getting arXiv previews")
             return []

     def _get_full_content(

@@ -203,7 +218,6 @@
                         "categories": paper.categories,
                         "summary": paper.summary,  # Full summary
                         "comment": paper.comment,
-                        "journal_ref": paper.journal_ref,
                         "doi": paper.doi,
                     }
                 )

@@ -263,17 +277,17 @@
                                     "Successfully extracted text from PDF using pdfplumber"
                                 )
                             except (ImportError, Exception) as e2:
-                                logger.error(
+                                logger.exception(
                                     f"PDF text extraction failed: {str(e1)}, then {str(e2)}"
                                 )
                                 logger.error(
                                     "Using paper summary as content instead"
                                 )
-                    except Exception as e:
-                        logger.error(f"Error extracting text from PDF: {e}")
+                    except Exception:
+                        logger.exception("Error extracting text from PDF")
                         logger.error("Using paper summary as content instead")
-            except Exception as e:
-                logger.error(f"Error downloading paper {paper.title}: {e}")
+            except Exception:
+                logger.exception(f"Error downloading paper {paper.title}")
                 result["pdf_path"] = None
                 pdf_count -= 1  # Decrement counter if download fails
             elif (
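
The e1/e2 pair in this hunk reflects a two-stage extraction: a primary PDF library, then pdfplumber as named in the success log. A sketch of that cascade; using pypdf for the first attempt is an assumption, since the primary extractor sits outside this hunk:

    from loguru import logger

    def extract_pdf_text(pdf_path: str) -> str:
        try:
            from pypdf import PdfReader  # assumed primary extractor

            reader = PdfReader(pdf_path)
            return "\n".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e1:
            try:
                import pdfplumber  # the fallback named in the log message above

                with pdfplumber.open(pdf_path) as pdf:
                    text = "\n".join(page.extract_text() or "" for page in pdf.pages)
                logger.info("Successfully extracted text from PDF using pdfplumber")
                return text
            except Exception as e2:
                logger.exception(f"PDF text extraction failed: {str(e1)}, then {str(e2)}")
                return ""  # caller substitutes the paper summary instead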
@@ -349,6 +363,7 @@
                     "authors": [
                         author.name for author in paper.authors[:3]
                     ],  # First 3 authors
+                    "journal_ref": paper.journal_ref,
                 }

                 # Add full content if not in snippet-only mode

@@ -375,7 +390,6 @@
                     "categories": paper.categories,
                     "summary": paper.summary,  # Full summary
                     "comment": paper.comment,
-                    "journal_ref": paper.journal_ref,
                     "doi": paper.doi,
                     "content": paper.summary,  # Use summary as content
                     "full_content": paper.summary,  # For consistency

@@ -388,13 +402,13 @@
                 # Download the paper
                 paper_path = paper.download_pdf(dirpath=self.download_dir)
                 result["pdf_path"] = str(paper_path)
-            except Exception as e:
-                logger.error(f"Error downloading paper: {e}")
+            except Exception:
+                logger.exception("Error downloading paper")

             return result

-        except Exception as e:
-            logger.error(f"Error getting paper details: {e}")
+        except Exception:
+            logger.exception("Error getting paper details")
             return {}

     def search_by_author(

@@ -64,11 +64,16 @@ class BraveSearchEngine(BaseSearchEngine):
             "russian": "ru",
         }

-        # Get API key
-        brave_api_key = api_key or os.getenv("BRAVE_API_KEY")
+        # Get API key - check params, env vars, or database
+        from ...utilities.db_utils import get_db_setting
+
+        brave_api_key = api_key
+        if not brave_api_key:
+            brave_api_key = get_db_setting("search.engine.web.brave.api_key")
+
         if not brave_api_key:
             raise ValueError(
-                "BRAVE_API_KEY not found. Please provide api_key or set the BRAVE_API_KEY environment variable."
+                "Brave API key not found. Please provide api_key parameter, set the BRAVE_API_KEY environment variable, or set it in the UI settings."
            )

         # Get language code
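
The Brave hunk swaps the direct os.getenv lookup for a database-backed setting, while the error message still names all three sources. A standalone sketch of that resolution order (illustrative only; whether get_db_setting itself consults environment variables is not visible in this diff):

    import os
    from typing import Optional

    from local_deep_research.utilities.db_utils import get_db_setting

    def resolve_brave_api_key(api_key: Optional[str] = None) -> str:
        """Resolve the key from, in order: explicit parameter, environment, UI settings DB."""
        key = (
            api_key
            or os.getenv("BRAVE_API_KEY")
            or get_db_setting("search.engine.web.brave.api_key")
        )
        if not key:
            raise ValueError(
                "Brave API key not found. Please provide api_key parameter, set the "
                "BRAVE_API_KEY environment variable, or set it in the UI settings."
            )
        return key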