local-deep-research 0.1.0__py3-none-any.whl → 0.1.12__py3-none-any.whl
- local_deep_research/defaults/main.toml +5 -0
- local_deep_research/search_system.py +98 -38
- local_deep_research/web/app.py +721 -169
- local_deep_research/web/static/css/styles.css +270 -5
- local_deep_research/web/static/js/app.js +2247 -562
- local_deep_research/web/templates/index.html +37 -1
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +454 -0
- local_deep_research/web_search_engines/search_engine_factory.py +20 -1
- {local_deep_research-0.1.0.dist-info → local_deep_research-0.1.12.dist-info}/METADATA +24 -6
- {local_deep_research-0.1.0.dist-info → local_deep_research-0.1.12.dist-info}/RECORD +14 -13
- {local_deep_research-0.1.0.dist-info → local_deep_research-0.1.12.dist-info}/WHEEL +1 -1
- {local_deep_research-0.1.0.dist-info → local_deep_research-0.1.12.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.1.0.dist-info → local_deep_research-0.1.12.dist-info/licenses}/LICENSE +0 -0
- {local_deep_research-0.1.0.dist-info → local_deep_research-0.1.12.dist-info}/top_level.txt +0 -0
local_deep_research/web/templates/index.html

@@ -8,7 +8,6 @@
     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">
     <!-- Change to CDN version that works in browsers -->
     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/github-dark.min.css">
-    <link rel="icon" type="image/png" href="{{ url_for('static', filename='favicon.ico') }}">
 </head>
 <body>
     <div class="app-container">

@@ -119,6 +118,9 @@
                     <i class="fas fa-stop-circle"></i> Terminate Research
                 </button>
                 <div id="error-message" class="error-message" style="display: none;"></div>
+                <button id="try-again-btn" class="btn btn-primary" style="display: none; margin-top: 15px;">
+                    <i class="fas fa-redo"></i> Try Again
+                </button>
             </div>
         </div>
     </div>

@@ -214,6 +216,31 @@
                 </div>
             </div>
         </div>
+
+        <!-- Collapsible Log Panel -->
+        <div class="collapsible-log-panel">
+            <div class="log-panel-header" id="log-panel-toggle">
+                <i class="fas fa-chevron-down toggle-icon"></i>
+                <span>Research Logs</span>
+                <span class="log-indicator" id="log-indicator">0</span>
+            </div>
+            <div class="log-panel-content" id="log-panel-content">
+                <div class="log-controls">
+                    <div class="log-filter">
+                        <div class="filter-buttons">
+                            <button class="small-btn selected" onclick="window.filterLogsByType('all')">All</button>
+                            <button class="small-btn" onclick="window.filterLogsByType('milestone')">Milestones</button>
+                            <button class="small-btn" onclick="window.filterLogsByType('info')">Info</button>
+                            <button class="small-btn" onclick="window.filterLogsByType('error')">Errors</button>
+                        </div>
+                    </div>
+                </div>
+                <div class="console-log" id="console-log-container">
+                    <!-- Logs will be added here dynamically -->
+                    <div class="empty-log-message">No logs yet. Research logs will appear here as they occur.</div>
+                </div>
+            </div>
+        </div>
     </main>
 </div>

@@ -308,5 +335,14 @@
         window.html2canvas_noSandbox = true;
     }
 </script>
+
+<!-- Add a template for console log entries -->
+<template id="console-log-entry-template">
+    <div class="console-log-entry">
+        <span class="log-timestamp"></span>
+        <span class="log-badge"></span>
+        <span class="log-message"></span>
+    </div>
+</template>
 </body>
 </html>
local_deep_research/web_search_engines/engines/search_engine_searxng.py (new file)

@@ -0,0 +1,454 @@
+import requests
+import logging
+import os
+from typing import Dict, List, Any, Optional
+from langchain_core.language_models import BaseLLM
+import time
+import json
+
+from web_search_engines.search_engine_base import BaseSearchEngine
+from web_search_engines.engines.full_search import FullSearchResults
+import config
+
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class SearXNGSearchEngine(BaseSearchEngine):
+    """
+    SearXNG search engine implementation that requires an instance URL provided via
+    environment variable or configuration. Designed for ethical usage with proper
+    rate limiting and single-instance approach.
+    """
+
+    def __init__(self,
+                 max_results: int = 15,
+                 instance_url: Optional[str] = None,  # Can be None if using env var
+                 categories: Optional[List[str]] = None,
+                 engines: Optional[List[str]] = None,
+                 language: str = "en",
+                 safe_search: int = 1,
+                 time_range: Optional[str] = None,
+                 delay_between_requests: float = 2.0,
+                 llm: Optional[BaseLLM] = None,
+                 max_filtered_results: Optional[int] = None,
+                 include_full_content: bool = True,
+                 api_key: Optional[str] = None):  # API key is actually the instance URL
+        """
+        Initialize the SearXNG search engine with ethical usage patterns.
+
+        Args:
+            max_results: Maximum number of search results
+            instance_url: URL of your SearXNG instance (preferably self-hosted)
+            categories: List of SearXNG categories to search in (general, images, videos, news, etc.)
+            engines: List of engines to use (google, bing, duckduckgo, etc.)
+            language: Language code for search results
+            safe_search: Safe search level (0=off, 1=moderate, 2=strict)
+            time_range: Time range for results (day, week, month, year)
+            delay_between_requests: Seconds to wait between requests
+            llm: Language model for relevance filtering
+            max_filtered_results: Maximum number of results to keep after filtering
+            include_full_content: Whether to include full webpage content in results
+            api_key: Alternative way to provide instance URL (takes precedence over instance_url)
+        """
+        # Initialize the BaseSearchEngine with the LLM and max_filtered_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
+
+        # Get instance URL from various sources in priority order:
+        # 1. api_key parameter (which is actually the instance URL)
+        # 2. SEARXNG_INSTANCE environment variable
+        # 3. instance_url parameter
+        # 4. Default to None, which will disable the engine
+        self.instance_url = api_key or os.getenv("SEARXNG_INSTANCE") or instance_url
+
+        # Add debug logging for instance URL
+        logger.info(f"SearXNG init - Instance URL sources: api_key={api_key}, env={os.getenv('SEARXNG_INSTANCE')}, param={instance_url}")
+
+        # Validate and normalize the instance URL if provided
+        if self.instance_url:
+            self.instance_url = self.instance_url.rstrip('/')
+            self.is_available = True
+            logger.info(f"SearXNG initialized with instance URL: {self.instance_url}")
+        else:
+            self.is_available = False
+            logger.error("No SearXNG instance URL provided. The engine is disabled. "
+                         "Set SEARXNG_INSTANCE environment variable or provide instance_url parameter.")
+
+        # Add debug logging for all parameters
+        logger.info(f"SearXNG init params: max_results={max_results}, language={language}, "
+                    f"max_filtered_results={max_filtered_results}, is_available={self.is_available}")
+
+        self.max_results = max_results
+        self.categories = categories or ["general"]
+        self.engines = engines
+        self.language = language
+        self.safe_search = safe_search
+        self.time_range = time_range
+
+        self.delay_between_requests = float(os.getenv("SEARXNG_DELAY", delay_between_requests))
+
+        self.include_full_content = include_full_content
+
+        if self.is_available:
+            self.search_url = f"{self.instance_url}/search"
+            logger.info(f"SearXNG engine initialized with instance: {self.instance_url}")
+            logger.info(f"Rate limiting set to {self.delay_between_requests} seconds between requests")
+
+            self.full_search = FullSearchResults(
+                llm=llm,
+                web_search=self,
+                language=language,
+                max_results=max_results,
+                region="wt-wt",
+                time="y",
+                safesearch="Moderate" if safe_search == 1 else "Off" if safe_search == 0 else "Strict"
+            )
+
+        self.last_request_time = 0
+
+    def _respect_rate_limit(self):
+        """Apply self-imposed rate limiting between requests"""
+        current_time = time.time()
+        time_since_last_request = current_time - self.last_request_time
+
+
+        if time_since_last_request < self.delay_between_requests:
+            wait_time = self.delay_between_requests - time_since_last_request
+            logger.info(f"Rate limiting: waiting {wait_time:.2f} seconds")
+            time.sleep(wait_time)
+
+        self.last_request_time = time.time()
+
+    def _get_search_results(self, query: str) -> List[Dict[str, Any]]:
+        """
+        Get search results from SearXNG with ethical rate limiting.
+
+        Args:
+            query: The search query
+
+        Returns:
+            List of search results from SearXNG
+        """
+        if not self.is_available:
+            logger.error("SearXNG engine is disabled (no instance URL provided) - cannot run search")
+            return []
+
+        logger.info(f"SearXNG running search for query: {query}")
+
+        try:
+            self._respect_rate_limit()
+
+            initial_headers = {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.9"
+            }
+
+            try:
+                initial_response = requests.get(self.instance_url, headers=initial_headers, timeout=10)
+                cookies = initial_response.cookies
+            except Exception as e:
+                logger.warning(f"Failed to get initial cookies: {e}")
+                cookies = None
+
+            params = {
+                "q": query,
+                "categories": ",".join(self.categories),
+                "language": self.language,
+                "format": "html",  # Use HTML format instead of JSON
+                "pageno": 1,
+                "safesearch": self.safe_search,
+                "count": self.max_results
+            }
+
+            if self.engines:
+                params["engines"] = ",".join(self.engines)
+
+            if self.time_range:
+                params["time_range"] = self.time_range
+
+            # Browser-like headers
+            headers = {
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.9",
+                "Referer": self.instance_url + "/",
+                "Connection": "keep-alive",
+                "Upgrade-Insecure-Requests": "1"
+            }
+
+            logger.info(f"Sending request to SearXNG instance at {self.instance_url}")
+            response = requests.get(
+                self.search_url,
+                params=params,
+                headers=headers,
+                cookies=cookies,
+                timeout=15
+            )
+
+            if response.status_code == 200:
+                try:
+                    from bs4 import BeautifulSoup
+
+                    soup = BeautifulSoup(response.text, 'html.parser')
+                    results = []
+
+                    result_elements = soup.select('.result-item')
+
+                    if not result_elements:
+                        result_elements = soup.select('.result')
+
+                    if not result_elements:
+                        result_elements = soup.select('article')
+
+                    if not result_elements:
+                        logger.debug(f"Classes found in HTML: {[c['class'] for c in soup.select('[class]') if 'class' in c.attrs][:10]}")
+                        result_elements = soup.select('div[id^="result"]')
+
+                    logger.info(f"Found {len(result_elements)} search result elements")
+
+                    for idx, result_element in enumerate(result_elements):
+                        if idx >= self.max_results:
+                            break
+
+                        title_element = (
+                            result_element.select_one('.result-title') or
+                            result_element.select_one('.title') or
+                            result_element.select_one('h3') or
+                            result_element.select_one('a[href]')
+                        )
+
+                        url_element = (
+                            result_element.select_one('.result-url') or
+                            result_element.select_one('.url') or
+                            result_element.select_one('a[href]')
+                        )
+
+                        content_element = (
+                            result_element.select_one('.result-content') or
+                            result_element.select_one('.content') or
+                            result_element.select_one('.snippet') or
+                            result_element.select_one('p')
+                        )
+
+                        title = title_element.get_text(strip=True) if title_element else ""
+
+                        url = ""
+                        if url_element and url_element.has_attr('href'):
+                            url = url_element['href']
+                        elif url_element:
+                            url = url_element.get_text(strip=True)
+
+                        content = content_element.get_text(strip=True) if content_element else ""
+
+                        if not url and title_element and title_element.has_attr('href'):
+                            url = title_element['href']
+
+                        logger.debug(f"Extracted result {idx}: title={title[:30]}..., url={url[:30]}..., content={content[:30]}...")
+
+                        # Add to results if we have at least a title or URL
+                        if title or url:
+                            results.append({
+                                "title": title,
+                                "url": url,
+                                "content": content,
+                                "engine": "searxng",
+                                "category": "general"
+                            })
+
+                    logger.info(f"SearXNG returned {len(results)} results from HTML parsing")
+                    return results
+
+                except ImportError:
+                    logger.error("BeautifulSoup not available for HTML parsing")
+                    return []
+                except Exception as e:
+                    logger.error(f"Error parsing HTML results: {str(e)}")
+                    return []
+            else:
+                logger.error(f"SearXNG returned status code {response.status_code}")
+                return []
+
+        except Exception as e:
+            logger.error(f"Error getting SearXNG results: {e}")
+            return []
+
+    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
+        """
+        Get preview information for SearXNG search results.
+
+        Args:
+            query: The search query
+
+        Returns:
+            List of preview dictionaries
+        """
+        if not self.is_available:
+            logger.warning("SearXNG engine is disabled (no instance URL provided)")
+            return []
+
+        logger.info(f"Getting SearXNG previews for query: {query}")
+
+        results = self._get_search_results(query)
+
+        if not results:
+            logger.warning(f"No SearXNG results found for query: {query}")
+            return []
+
+        previews = []
+        for i, result in enumerate(results):
+            title = result.get("title", "")
+            url = result.get("url", "")
+            content = result.get("content", "")
+
+            preview = {
+                "id": url or f"searxng-result-{i}",
+                "title": title,
+                "link": url,
+                "snippet": content,
+                "engine": result.get("engine", ""),
+                "category": result.get("category", "")
+            }
+
+            previews.append(preview)
+
+        return previews
+
+    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Get full content for the relevant search results.
+
+        Args:
+            relevant_items: List of relevant preview dictionaries
+
+        Returns:
+            List of result dictionaries with full content
+        """
+        if not self.is_available:
+            return relevant_items
+
+        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+            logger.info("Snippet-only mode, skipping full content retrieval")
+            return relevant_items
+
+        logger.info("Retrieving full webpage content")
+
+        try:
+            results_with_content = self.full_search._get_full_content(relevant_items)
+            return results_with_content
+
+        except Exception as e:
+            logger.error(f"Error retrieving full content: {e}")
+            return relevant_items
+
+    def invoke(self, query: str) -> List[Dict[str, Any]]:
+        """Compatibility method for LangChain tools"""
+        return self.run(query)
+
+    def results(self, query: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
+        """
+        Get search results in a format compatible with other search engines.
+
+        Args:
+            query: The search query
+            max_results: Optional override for maximum results
+
+        Returns:
+            List of search result dictionaries
+        """
+        if not self.is_available:
+            return []
+
+        original_max_results = self.max_results
+
+        try:
+            if max_results is not None:
+                self.max_results = max_results
+
+            results = self._get_search_results(query)
+
+            formatted_results = []
+            for result in results:
+                formatted_results.append({
+                    "title": result.get("title", ""),
+                    "link": result.get("url", ""),
+                    "snippet": result.get("content", "")
+                })
+
+            return formatted_results
+
+        finally:
+            self.max_results = original_max_results
+
+    @staticmethod
+    def get_self_hosting_instructions() -> str:
+        """
+        Get instructions for self-hosting a SearXNG instance.
+
+        Returns:
+            String with installation instructions
+        """
+        return """
+# SearXNG Self-Hosting Instructions
+
+The most ethical way to use SearXNG is to host your own instance. Here's how:
+
+## Using Docker (easiest method)
+
+1. Install Docker if you don't have it already
+2. Run these commands:
+
+```bash
+# Pull the SearXNG Docker image
+docker pull searxng/searxng
+
+# Run SearXNG (will be available at http://localhost:8080)
+docker run -d -p 8080:8080 --name searxng searxng/searxng
+```
+
+## Using Docker Compose (recommended for production)
+
+1. Create a file named `docker-compose.yml` with the following content:
+
+```yaml
+version: '3'
+services:
+  searxng:
+    container_name: searxng
+    image: searxng/searxng
+    ports:
+      - "8080:8080"
+    volumes:
+      - ./searxng:/etc/searxng
+    environment:
+      - SEARXNG_BASE_URL=http://localhost:8080/
+    restart: unless-stopped
+```
+
+2. Run with Docker Compose:
+
+```bash
+docker-compose up -d
+```
+
+For more detailed instructions and configuration options, visit:
+https://searxng.github.io/searxng/admin/installation.html
+"""
+
+    def run(self, query: str) -> List[Dict[str, Any]]:
+        """
+        Override BaseSearchEngine run method to add SearXNG-specific error handling.
+        """
+        if not self.is_available:
+            logger.error("SearXNG run method called but engine is not available (missing instance URL)")
+            return []
+
+        logger.info(f"SearXNG run method called with query: {query}")
+
+        try:
+            # Call the parent class's run method
+            return super().run(query)
+        except Exception as e:
+            logger.error(f"Error in SearXNG run method: {str(e)}")
+            # Return empty results on error
+            return []
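The new module makes the engine usable only when an instance URL arrives via the `api_key` parameter, the `SEARXNG_INSTANCE` environment variable, or `instance_url`; otherwise `is_available` stays false and every method returns an empty list. A minimal usage sketch, assuming a self-hosted instance at http://localhost:8080 and the import layout used in the diff:

```python
# Sketch only: assumes a SearXNG instance is running at http://localhost:8080
# and that the import paths match those shown in the diff above.
from web_search_engines.engines.search_engine_searxng import SearXNGSearchEngine

engine = SearXNGSearchEngine(
    instance_url="http://localhost:8080",  # or export SEARXNG_INSTANCE instead
    max_results=5,
)

# results() returns dicts with "title", "link", and "snippet" keys.
for result in engine.results("open source metasearch engines"):
    print(result["title"], "->", result["link"])
```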
local_deep_research/web_search_engines/search_engine_factory.py

@@ -230,4 +230,23 @@ def get_search(search_tool: str, llm_instance,
         params["time_period"] = time_period
 
     # Create and return the search engine
-    return create_search_engine(search_tool, **params)
+    logger.info(f"Creating search engine for tool: {search_tool} with params: {params.keys()}")
+    engine = create_search_engine(search_tool, **params)
+
+    # Add debugging to check if engine is None
+    if engine is None:
+        logger.error(f"Failed to create search engine for {search_tool} - returned None")
+    else:
+        engine_type = type(engine).__name__
+        logger.info(f"Successfully created search engine of type: {engine_type}")
+        # Check if the engine has run method
+        if hasattr(engine, 'run'):
+            logger.info(f"Engine has 'run' method: {getattr(engine, 'run')}")
+        else:
+            logger.error(f"Engine does NOT have 'run' method!")
+
+        # For SearxNG, check availability flag
+        if hasattr(engine, 'is_available'):
+            logger.info(f"Engine availability flag: {engine.is_available}")
+
+    return engine
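Because `create_search_engine` may return `None`, callers should treat the factory output as optional, which is exactly what the new logging makes visible. A hedged sketch of the calling pattern; `get_search`'s full signature is truncated in the hunk header, so the `llm_instance=None` argument here is an assumption:

```python
# Sketch only: get_search's remaining parameters are not shown in this diff.
from web_search_engines.search_engine_factory import get_search

engine = get_search("searxng", llm_instance=None)
if engine is not None and getattr(engine, "is_available", False):
    for result in engine.run("open source metasearch engines"):
        print(result)
else:
    print("Engine could not be created or is disabled")
```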
local_deep_research-0.1.12.dist-info/METADATA

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: local-deep-research
-Version: 0.1.0
+Version: 0.1.12
 Summary: AI-powered research assistant with deep, iterative analysis using LLMs and web searches
 Author-email: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com>, HashedViking <6432677+HashedViking@users.noreply.github.com>
 License: MIT License

@@ -51,7 +51,7 @@ Requires-Dist: flask-socketio>=5.1.1
 Requires-Dist: sqlalchemy>=1.4.23
 Requires-Dist: wikipedia
 Requires-Dist: arxiv>=1.4.3
-Requires-Dist: 
+Requires-Dist: pypdf
 Requires-Dist: sentence-transformers
 Requires-Dist: faiss-cpu
 Requires-Dist: pydantic>=2.0.0

@@ -59,6 +59,7 @@ Requires-Dist: pydantic-settings>=2.0.0
 Requires-Dist: toml>=0.10.2
 Requires-Dist: platformdirs>=3.0.0
 Requires-Dist: dynaconf
+Dynamic: license-file
 
 # Local Deep Research
 

@@ -91,12 +92,13 @@ A powerful AI-powered research assistant that performs deep, iterative analysis
 
 - 🌐 **Enhanced Search Integration**
   - **Auto-selection of search sources**: The "auto" search engine intelligently analyzes your query and selects the most appropriate search engine based on the query content
+  - **SearXNG** integration for local web-search engine, great for privacy, no API key required (requires a searxng server)
   - Wikipedia integration for factual knowledge
   - arXiv integration for scientific papers and academic research
   - PubMed integration for biomedical literature and medical research
   - DuckDuckGo integration for web searches (may experience rate limiting)
   - SerpAPI integration for Google search results (requires API key)
-  - 
+  - Google Programmable Search Engine integration for custom search experiences (requires API key)
   - The Guardian integration for news articles and journalism (requires API key)
   - **Local RAG search for private documents** - search your own documents with vector embeddings
   - Full webpage content retrieval

@@ -127,10 +129,10 @@ This example showcases the system's ability to perform multiple research iterat
 
 1. Clone the repository:
 ```bash
-git clone https://github.com/
+git clone https://github.com/LearningCircuit/local-deep-research.git
 cd local-deep-research
 ```
-
+(experimental pip install with new features (but not so well tested yet): **pip install local-deep-research** )
 2. Install dependencies:
 ```bash
 pip install -r requirements.txt

@@ -147,6 +149,20 @@ ollama pull mistral # Default model - many work really well choose best for you
 ```bash
 # Copy the template
 cp .env.template .env
+```
+
+## Experimental install
+```bash
+#experimental pip install with new features (but not so well tested yet):
+pip install local-deep-research
+playwright install
+ollama pull mistral
+```
+## Community & Support
+
+We've just launched our [Discord server](https://discord.gg/2E6gYU2Z) for this project!
+
+Our Discord server can help to exchange ideas about research approaches, discuss advanced usage patterns, and share other ideas.
 
 # Edit .env with your API keys (if using cloud LLMs)
 ANTHROPIC_API_KEY=your-api-key-here # For Claude

@@ -276,6 +292,7 @@ You can use local search in several ways:
 The system supports multiple search engines that can be selected by changing the `search_tool` variable in `config.py`:
 
 - **Auto** (`auto`): Intelligent search engine selector that analyzes your query and chooses the most appropriate source (Wikipedia, arXiv, local collections, etc.)
+- **SearXNG** (`searxng`): Local web-search engine, great for privacy, no API key required (requires a searxng server)
 - **Wikipedia** (`wiki`): Best for general knowledge, facts, and overview information
 - **arXiv** (`arxiv`): Great for scientific and academic research, accessing preprints and papers
 - **PubMed** (`pubmed`): Excellent for biomedical literature, medical research, and health information

@@ -307,6 +324,7 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
 - [DuckDuckGo](https://duckduckgo.com) for web search
 - [The Guardian](https://www.theguardian.com/) for quality journalism
 - [SerpAPI](https://serpapi.com) for Google search results (requires API key)
+- [SearXNG](https://searxng.org/) for local web-search engine
 - Built on [LangChain](https://github.com/hwchase17/langchain) framework
 - Uses [justext](https://github.com/miso-belica/justext) for content extraction
 - [Playwright](https://playwright.dev) for web content retrieval