local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +154 -160
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +87 -45
- local_deep_research/search_system.py +153 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1583 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
- local_deep_research-0.2.2.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
@@ -1,42 +1,45 @@
|
|
1
|
-
import requests
|
2
1
|
import logging
|
3
2
|
import os
|
4
|
-
from typing import Dict, List, Any, Optional
|
5
|
-
from langchain_core.language_models import BaseLLM
|
6
3
|
import time
|
7
|
-
import
|
4
|
+
from typing import Any, Dict, List, Optional
|
5
|
+
|
6
|
+
import requests
|
7
|
+
from langchain_core.language_models import BaseLLM
|
8
8
|
|
9
|
-
from
|
10
|
-
from
|
11
|
-
import
|
9
|
+
from ...config import search_config
|
10
|
+
from ..search_engine_base import BaseSearchEngine
|
11
|
+
from .full_search import FullSearchResults
|
12
12
|
|
13
13
|
# Setup logging
|
14
14
|
logging.basicConfig(level=logging.INFO)
|
15
15
|
logger = logging.getLogger(__name__)
|
16
16
|
|
17
|
+
|
17
18
|
class SearXNGSearchEngine(BaseSearchEngine):
|
18
19
|
"""
|
19
20
|
SearXNG search engine implementation that requires an instance URL provided via
|
20
21
|
environment variable or configuration. Designed for ethical usage with proper
|
21
22
|
rate limiting and single-instance approach.
|
22
23
|
"""
|
23
|
-
|
24
|
-
def __init__(
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
24
|
+
|
25
|
+
def __init__(
|
26
|
+
self,
|
27
|
+
max_results: int = 15,
|
28
|
+
instance_url: Optional[str] = None, # Can be None if using env var
|
29
|
+
categories: Optional[List[str]] = None,
|
30
|
+
engines: Optional[List[str]] = None,
|
31
|
+
language: str = "en",
|
32
|
+
safe_search: int = 1,
|
33
|
+
time_range: Optional[str] = None,
|
34
|
+
delay_between_requests: float = 0.0,
|
35
|
+
llm: Optional[BaseLLM] = None,
|
36
|
+
max_filtered_results: Optional[int] = None,
|
37
|
+
include_full_content: bool = True,
|
38
|
+
api_key: Optional[str] = None,
|
39
|
+
): # API key is actually the instance URL
|
37
40
|
"""
|
38
41
|
Initialize the SearXNG search engine with ethical usage patterns.
|
39
|
-
|
42
|
+
|
40
43
|
Args:
|
41
44
|
max_results: Maximum number of search results
|
42
45
|
instance_url: URL of your SearXNG instance (preferably self-hosted)
|
@@ -53,48 +56,62 @@ class SearXNGSearchEngine(BaseSearchEngine):
|
|
53
56
|
"""
|
54
57
|
|
55
58
|
# Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
|
56
|
-
super().__init__(
|
57
|
-
|
59
|
+
super().__init__(
|
60
|
+
llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
|
61
|
+
)
|
62
|
+
|
58
63
|
# Get instance URL from various sources in priority order:
|
59
64
|
# 1. api_key parameter (which is actually the instance URL)
|
60
65
|
# 2. SEARXNG_INSTANCE environment variable
|
61
66
|
# 3. instance_url parameter
|
62
67
|
# 4. Default to None, which will disable the engine
|
63
|
-
self.instance_url = api_key or os.getenv("SEARXNG_INSTANCE") or instance_url
|
64
|
-
|
68
|
+
self.instance_url = api_key or os.getenv("SEARXNG_INSTANCE") or instance_url or "http://localhost:8080"
|
69
|
+
|
65
70
|
# Add debug logging for instance URL
|
66
|
-
logger.info(
|
67
|
-
|
71
|
+
logger.info(
|
72
|
+
f"SearXNG init - Instance URL sources: api_key={api_key}, env={os.getenv('SEARXNG_INSTANCE')}, param={instance_url}"
|
73
|
+
)
|
74
|
+
|
68
75
|
# Validate and normalize the instance URL if provided
|
69
76
|
if self.instance_url:
|
70
|
-
self.instance_url = self.instance_url.rstrip(
|
77
|
+
self.instance_url = self.instance_url.rstrip("/")
|
71
78
|
self.is_available = True
|
72
79
|
logger.info(f"SearXNG initialized with instance URL: {self.instance_url}")
|
73
80
|
else:
|
74
81
|
self.is_available = False
|
75
|
-
logger.error(
|
76
|
-
|
77
|
-
|
82
|
+
logger.error(
|
83
|
+
"No SearXNG instance URL provided. The engine is disabled. "
|
84
|
+
"Set SEARXNG_INSTANCE environment variable or provide instance_url parameter."
|
85
|
+
)
|
86
|
+
|
78
87
|
# Add debug logging for all parameters
|
79
|
-
logger.info(
|
80
|
-
|
81
|
-
|
88
|
+
logger.info(
|
89
|
+
f"SearXNG init params: max_results={max_results}, language={language}, "
|
90
|
+
f"max_filtered_results={max_filtered_results}, is_available={self.is_available}"
|
91
|
+
)
|
92
|
+
|
82
93
|
self.max_results = max_results
|
83
94
|
self.categories = categories or ["general"]
|
84
95
|
self.engines = engines
|
85
96
|
self.language = language
|
86
97
|
self.safe_search = safe_search
|
87
98
|
self.time_range = time_range
|
88
|
-
|
89
|
-
self.delay_between_requests = float(
|
90
|
-
|
99
|
+
|
100
|
+
self.delay_between_requests = float(
|
101
|
+
os.getenv("SEARXNG_DELAY", delay_between_requests)
|
102
|
+
)
|
103
|
+
|
91
104
|
self.include_full_content = include_full_content
|
92
|
-
|
105
|
+
|
93
106
|
if self.is_available:
|
94
107
|
self.search_url = f"{self.instance_url}/search"
|
95
|
-
logger.info(
|
96
|
-
|
97
|
-
|
108
|
+
logger.info(
|
109
|
+
f"SearXNG engine initialized with instance: {self.instance_url}"
|
110
|
+
)
|
111
|
+
logger.info(
|
112
|
+
f"Rate limiting set to {self.delay_between_requests} seconds between requests"
|
113
|
+
)
|
114
|
+
|
98
115
|
self.full_search = FullSearchResults(
|
99
116
|
llm=llm,
|
100
117
|
web_search=self,
|
@@ -102,56 +119,63 @@ class SearXNGSearchEngine(BaseSearchEngine):
|
|
102
119
|
max_results=max_results,
|
103
120
|
region="wt-wt",
|
104
121
|
time="y",
|
105
|
-
safesearch=
|
122
|
+
safesearch=(
|
123
|
+
"Moderate"
|
124
|
+
if safe_search == 1
|
125
|
+
else "Off" if safe_search == 0 else "Strict"
|
126
|
+
),
|
106
127
|
)
|
107
|
-
|
128
|
+
|
108
129
|
self.last_request_time = 0
|
109
|
-
|
130
|
+
|
110
131
|
def _respect_rate_limit(self):
|
111
132
|
"""Apply self-imposed rate limiting between requests"""
|
112
133
|
current_time = time.time()
|
113
134
|
time_since_last_request = current_time - self.last_request_time
|
114
|
-
|
115
135
|
|
116
136
|
if time_since_last_request < self.delay_between_requests:
|
117
137
|
wait_time = self.delay_between_requests - time_since_last_request
|
118
138
|
logger.info(f"Rate limiting: waiting {wait_time:.2f} seconds")
|
119
139
|
time.sleep(wait_time)
|
120
|
-
|
140
|
+
|
121
141
|
self.last_request_time = time.time()
|
122
|
-
|
142
|
+
|
123
143
|
def _get_search_results(self, query: str) -> List[Dict[str, Any]]:
|
124
144
|
"""
|
125
145
|
Get search results from SearXNG with ethical rate limiting.
|
126
|
-
|
146
|
+
|
127
147
|
Args:
|
128
148
|
query: The search query
|
129
|
-
|
149
|
+
|
130
150
|
Returns:
|
131
151
|
List of search results from SearXNG
|
132
152
|
"""
|
133
153
|
if not self.is_available:
|
134
|
-
logger.error(
|
154
|
+
logger.error(
|
155
|
+
"SearXNG engine is disabled (no instance URL provided) - cannot run search"
|
156
|
+
)
|
135
157
|
return []
|
136
|
-
|
158
|
+
|
137
159
|
logger.info(f"SearXNG running search for query: {query}")
|
138
|
-
|
160
|
+
|
139
161
|
try:
|
140
162
|
self._respect_rate_limit()
|
141
|
-
|
163
|
+
|
142
164
|
initial_headers = {
|
143
165
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
144
166
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
145
|
-
"Accept-Language": "en-US,en;q=0.9"
|
167
|
+
"Accept-Language": "en-US,en;q=0.9",
|
146
168
|
}
|
147
|
-
|
169
|
+
|
148
170
|
try:
|
149
|
-
initial_response = requests.get(
|
171
|
+
initial_response = requests.get(
|
172
|
+
self.instance_url, headers=initial_headers, timeout=10
|
173
|
+
)
|
150
174
|
cookies = initial_response.cookies
|
151
175
|
except Exception as e:
|
152
176
|
logger.warning(f"Failed to get initial cookies: {e}")
|
153
177
|
cookies = None
|
154
|
-
|
178
|
+
|
155
179
|
params = {
|
156
180
|
"q": query,
|
157
181
|
"categories": ",".join(self.categories),
|
@@ -159,15 +183,15 @@ class SearXNGSearchEngine(BaseSearchEngine):
|
|
159
183
|
"format": "html", # Use HTML format instead of JSON
|
160
184
|
"pageno": 1,
|
161
185
|
"safesearch": self.safe_search,
|
162
|
-
"count": self.max_results
|
186
|
+
"count": self.max_results,
|
163
187
|
}
|
164
|
-
|
188
|
+
|
165
189
|
if self.engines:
|
166
190
|
params["engines"] = ",".join(self.engines)
|
167
|
-
|
191
|
+
|
168
192
|
if self.time_range:
|
169
193
|
params["time_range"] = self.time_range
|
170
|
-
|
194
|
+
|
171
195
|
# Browser-like headers
|
172
196
|
headers = {
|
173
197
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
@@ -175,91 +199,105 @@ class SearXNGSearchEngine(BaseSearchEngine):
|
|
175
199
|
"Accept-Language": "en-US,en;q=0.9",
|
176
200
|
"Referer": self.instance_url + "/",
|
177
201
|
"Connection": "keep-alive",
|
178
|
-
"Upgrade-Insecure-Requests": "1"
|
202
|
+
"Upgrade-Insecure-Requests": "1",
|
179
203
|
}
|
180
|
-
|
204
|
+
|
181
205
|
logger.info(f"Sending request to SearXNG instance at {self.instance_url}")
|
182
206
|
response = requests.get(
|
183
207
|
self.search_url,
|
184
208
|
params=params,
|
185
209
|
headers=headers,
|
186
210
|
cookies=cookies,
|
187
|
-
timeout=15
|
211
|
+
timeout=15,
|
188
212
|
)
|
189
|
-
|
213
|
+
|
190
214
|
if response.status_code == 200:
|
191
215
|
try:
|
192
216
|
from bs4 import BeautifulSoup
|
193
|
-
|
194
|
-
soup = BeautifulSoup(response.text,
|
217
|
+
|
218
|
+
soup = BeautifulSoup(response.text, "html.parser")
|
195
219
|
results = []
|
196
|
-
|
197
|
-
result_elements = soup.select(
|
198
|
-
|
220
|
+
|
221
|
+
result_elements = soup.select(".result-item")
|
222
|
+
|
199
223
|
if not result_elements:
|
200
|
-
result_elements = soup.select(
|
201
|
-
|
224
|
+
result_elements = soup.select(".result")
|
225
|
+
|
202
226
|
if not result_elements:
|
203
|
-
result_elements = soup.select(
|
204
|
-
|
227
|
+
result_elements = soup.select("article")
|
228
|
+
|
205
229
|
if not result_elements:
|
206
|
-
logger.debug(
|
230
|
+
logger.debug(
|
231
|
+
f"Classes found in HTML: {[c['class'] for c in soup.select('[class]') if 'class' in c.attrs][:10]}"
|
232
|
+
)
|
207
233
|
result_elements = soup.select('div[id^="result"]')
|
208
|
-
|
234
|
+
|
209
235
|
logger.info(f"Found {len(result_elements)} search result elements")
|
210
|
-
|
236
|
+
|
211
237
|
for idx, result_element in enumerate(result_elements):
|
212
238
|
if idx >= self.max_results:
|
213
239
|
break
|
214
|
-
|
240
|
+
|
215
241
|
title_element = (
|
216
|
-
result_element.select_one(
|
217
|
-
result_element.select_one(
|
218
|
-
result_element.select_one(
|
219
|
-
result_element.select_one(
|
242
|
+
result_element.select_one(".result-title")
|
243
|
+
or result_element.select_one(".title")
|
244
|
+
or result_element.select_one("h3")
|
245
|
+
or result_element.select_one("a[href]")
|
220
246
|
)
|
221
|
-
|
247
|
+
|
222
248
|
url_element = (
|
223
|
-
result_element.select_one(
|
224
|
-
result_element.select_one(
|
225
|
-
result_element.select_one(
|
249
|
+
result_element.select_one(".result-url")
|
250
|
+
or result_element.select_one(".url")
|
251
|
+
or result_element.select_one("a[href]")
|
226
252
|
)
|
227
|
-
|
253
|
+
|
228
254
|
content_element = (
|
229
|
-
result_element.select_one(
|
230
|
-
result_element.select_one(
|
231
|
-
result_element.select_one(
|
232
|
-
result_element.select_one(
|
255
|
+
result_element.select_one(".result-content")
|
256
|
+
or result_element.select_one(".content")
|
257
|
+
or result_element.select_one(".snippet")
|
258
|
+
or result_element.select_one("p")
|
259
|
+
)
|
260
|
+
|
261
|
+
title = (
|
262
|
+
title_element.get_text(strip=True) if title_element else ""
|
233
263
|
)
|
234
|
-
|
235
|
-
title = title_element.get_text(strip=True) if title_element else ""
|
236
|
-
|
264
|
+
|
237
265
|
url = ""
|
238
|
-
if url_element and url_element.has_attr(
|
239
|
-
url = url_element[
|
266
|
+
if url_element and url_element.has_attr("href"):
|
267
|
+
url = url_element["href"]
|
240
268
|
elif url_element:
|
241
269
|
url = url_element.get_text(strip=True)
|
242
|
-
|
243
|
-
content =
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
270
|
+
|
271
|
+
content = (
|
272
|
+
content_element.get_text(strip=True)
|
273
|
+
if content_element
|
274
|
+
else ""
|
275
|
+
)
|
276
|
+
|
277
|
+
if not url and title_element and title_element.has_attr("href"):
|
278
|
+
url = title_element["href"]
|
279
|
+
|
280
|
+
logger.debug(
|
281
|
+
f"Extracted result {idx}: title={title[:30]}..., url={url[:30]}..., content={content[:30]}..."
|
282
|
+
)
|
283
|
+
|
250
284
|
# Add to results if we have at least a title or URL
|
251
285
|
if title or url:
|
252
|
-
results.append(
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
286
|
+
results.append(
|
287
|
+
{
|
288
|
+
"title": title,
|
289
|
+
"url": url,
|
290
|
+
"content": content,
|
291
|
+
"engine": "searxng",
|
292
|
+
"category": "general",
|
293
|
+
}
|
294
|
+
)
|
295
|
+
|
296
|
+
logger.info(
|
297
|
+
f"SearXNG returned {len(results)} results from HTML parsing"
|
298
|
+
)
|
261
299
|
return results
|
262
|
-
|
300
|
+
|
263
301
|
except ImportError:
|
264
302
|
logger.error("BeautifulSoup not available for HTML parsing")
|
265
303
|
return []
|
@@ -269,123 +307,132 @@ class SearXNGSearchEngine(BaseSearchEngine):
|
|
269
307
|
else:
|
270
308
|
logger.error(f"SearXNG returned status code {response.status_code}")
|
271
309
|
return []
|
272
|
-
|
310
|
+
|
273
311
|
except Exception as e:
|
274
312
|
logger.error(f"Error getting SearXNG results: {e}")
|
275
313
|
return []
|
276
|
-
|
314
|
+
|
277
315
|
def _get_previews(self, query: str) -> List[Dict[str, Any]]:
|
278
316
|
"""
|
279
317
|
Get preview information for SearXNG search results.
|
280
|
-
|
318
|
+
|
281
319
|
Args:
|
282
320
|
query: The search query
|
283
|
-
|
321
|
+
|
284
322
|
Returns:
|
285
323
|
List of preview dictionaries
|
286
324
|
"""
|
287
325
|
if not self.is_available:
|
288
326
|
logger.warning("SearXNG engine is disabled (no instance URL provided)")
|
289
327
|
return []
|
290
|
-
|
328
|
+
|
291
329
|
logger.info(f"Getting SearXNG previews for query: {query}")
|
292
|
-
|
330
|
+
|
293
331
|
results = self._get_search_results(query)
|
294
|
-
|
332
|
+
|
295
333
|
if not results:
|
296
334
|
logger.warning(f"No SearXNG results found for query: {query}")
|
297
335
|
return []
|
298
|
-
|
336
|
+
|
299
337
|
previews = []
|
300
338
|
for i, result in enumerate(results):
|
301
339
|
title = result.get("title", "")
|
302
340
|
url = result.get("url", "")
|
303
341
|
content = result.get("content", "")
|
304
|
-
|
342
|
+
|
305
343
|
preview = {
|
306
344
|
"id": url or f"searxng-result-{i}",
|
307
345
|
"title": title,
|
308
346
|
"link": url,
|
309
347
|
"snippet": content,
|
310
348
|
"engine": result.get("engine", ""),
|
311
|
-
"category": result.get("category", "")
|
349
|
+
"category": result.get("category", ""),
|
312
350
|
}
|
313
|
-
|
351
|
+
|
314
352
|
previews.append(preview)
|
315
|
-
|
353
|
+
|
316
354
|
return previews
|
317
|
-
|
318
|
-
def _get_full_content(
|
355
|
+
|
356
|
+
def _get_full_content(
|
357
|
+
self, relevant_items: List[Dict[str, Any]]
|
358
|
+
) -> List[Dict[str, Any]]:
|
319
359
|
"""
|
320
360
|
Get full content for the relevant search results.
|
321
|
-
|
361
|
+
|
322
362
|
Args:
|
323
363
|
relevant_items: List of relevant preview dictionaries
|
324
|
-
|
364
|
+
|
325
365
|
Returns:
|
326
366
|
List of result dictionaries with full content
|
327
367
|
"""
|
328
368
|
if not self.is_available:
|
329
369
|
return relevant_items
|
330
|
-
|
331
|
-
if
|
370
|
+
|
371
|
+
if (
|
372
|
+
hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
|
373
|
+
and search_config.SEARCH_SNIPPETS_ONLY
|
374
|
+
):
|
332
375
|
logger.info("Snippet-only mode, skipping full content retrieval")
|
333
376
|
return relevant_items
|
334
|
-
|
377
|
+
|
335
378
|
logger.info("Retrieving full webpage content")
|
336
|
-
|
379
|
+
|
337
380
|
try:
|
338
381
|
results_with_content = self.full_search._get_full_content(relevant_items)
|
339
382
|
return results_with_content
|
340
|
-
|
383
|
+
|
341
384
|
except Exception as e:
|
342
385
|
logger.error(f"Error retrieving full content: {e}")
|
343
386
|
return relevant_items
|
344
|
-
|
387
|
+
|
345
388
|
def invoke(self, query: str) -> List[Dict[str, Any]]:
|
346
389
|
"""Compatibility method for LangChain tools"""
|
347
390
|
return self.run(query)
|
348
|
-
|
349
|
-
def results(
|
391
|
+
|
392
|
+
def results(
|
393
|
+
self, query: str, max_results: Optional[int] = None
|
394
|
+
) -> List[Dict[str, Any]]:
|
350
395
|
"""
|
351
396
|
Get search results in a format compatible with other search engines.
|
352
|
-
|
397
|
+
|
353
398
|
Args:
|
354
399
|
query: The search query
|
355
400
|
max_results: Optional override for maximum results
|
356
|
-
|
401
|
+
|
357
402
|
Returns:
|
358
403
|
List of search result dictionaries
|
359
404
|
"""
|
360
405
|
if not self.is_available:
|
361
406
|
return []
|
362
|
-
|
407
|
+
|
363
408
|
original_max_results = self.max_results
|
364
|
-
|
409
|
+
|
365
410
|
try:
|
366
411
|
if max_results is not None:
|
367
412
|
self.max_results = max_results
|
368
|
-
|
413
|
+
|
369
414
|
results = self._get_search_results(query)
|
370
|
-
|
415
|
+
|
371
416
|
formatted_results = []
|
372
417
|
for result in results:
|
373
|
-
formatted_results.append(
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
418
|
+
formatted_results.append(
|
419
|
+
{
|
420
|
+
"title": result.get("title", ""),
|
421
|
+
"link": result.get("url", ""),
|
422
|
+
"snippet": result.get("content", ""),
|
423
|
+
}
|
424
|
+
)
|
425
|
+
|
379
426
|
return formatted_results
|
380
|
-
|
427
|
+
|
381
428
|
finally:
|
382
429
|
self.max_results = original_max_results
|
383
|
-
|
430
|
+
|
384
431
|
@staticmethod
|
385
432
|
def get_self_hosting_instructions() -> str:
|
386
433
|
"""
|
387
434
|
Get instructions for self-hosting a SearXNG instance.
|
388
|
-
|
435
|
+
|
389
436
|
Returns:
|
390
437
|
String with installation instructions
|
391
438
|
"""
|
@@ -441,15 +488,20 @@ https://searxng.github.io/searxng/admin/installation.html
|
|
441
488
|
Override BaseSearchEngine run method to add SearXNG-specific error handling.
|
442
489
|
"""
|
443
490
|
if not self.is_available:
|
444
|
-
logger.error(
|
491
|
+
logger.error(
|
492
|
+
"SearXNG run method called but engine is not available (missing instance URL)"
|
493
|
+
)
|
445
494
|
return []
|
446
|
-
|
447
|
-
logger.info(f"SearXNG
|
448
|
-
|
495
|
+
|
496
|
+
logger.info(f"SearXNG search engine running with query: '{query}'")
|
497
|
+
logger.info(f"SearXNG instance URL: {self.instance_url}")
|
498
|
+
|
449
499
|
try:
|
450
500
|
# Call the parent class's run method
|
451
|
-
|
501
|
+
results = super().run(query)
|
502
|
+
logger.info(f"SearXNG search completed with {len(results)} results")
|
503
|
+
return results
|
452
504
|
except Exception as e:
|
453
505
|
logger.error(f"Error in SearXNG run method: {str(e)}")
|
454
506
|
# Return empty results on error
|
455
|
-
return []
|
507
|
+
return []
|