local-deep-research 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +24 -0
- local_deep_research/citation_handler.py +113 -0
- local_deep_research/config.py +166 -0
- local_deep_research/defaults/__init__.py +44 -0
- local_deep_research/defaults/llm_config.py +269 -0
- local_deep_research/defaults/local_collections.toml +47 -0
- local_deep_research/defaults/main.toml +57 -0
- local_deep_research/defaults/search_engines.toml +244 -0
- local_deep_research/local_collections.py +141 -0
- local_deep_research/main.py +113 -0
- local_deep_research/report_generator.py +206 -0
- local_deep_research/search_system.py +241 -0
- local_deep_research/utilties/__init__.py +0 -0
- local_deep_research/utilties/enums.py +9 -0
- local_deep_research/utilties/llm_utils.py +116 -0
- local_deep_research/utilties/search_utilities.py +115 -0
- local_deep_research/utilties/setup_utils.py +6 -0
- local_deep_research/web/__init__.py +2 -0
- local_deep_research/web/app.py +1209 -0
- local_deep_research/web/static/css/styles.css +1008 -0
- local_deep_research/web/static/js/app.js +2078 -0
- local_deep_research/web/templates/api_keys_config.html +82 -0
- local_deep_research/web/templates/collections_config.html +90 -0
- local_deep_research/web/templates/index.html +312 -0
- local_deep_research/web/templates/llm_config.html +120 -0
- local_deep_research/web/templates/main_config.html +89 -0
- local_deep_research/web/templates/search_engines_config.html +154 -0
- local_deep_research/web/templates/settings.html +519 -0
- local_deep_research/web/templates/settings_dashboard.html +207 -0
- local_deep_research/web_search_engines/__init__.py +0 -0
- local_deep_research/web_search_engines/engines/__init__.py +0 -0
- local_deep_research/web_search_engines/engines/full_search.py +128 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
- local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
- local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
- local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
- local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
- local_deep_research/web_search_engines/full_search.py +254 -0
- local_deep_research/web_search_engines/search_engine_base.py +197 -0
- local_deep_research/web_search_engines/search_engine_factory.py +233 -0
- local_deep_research/web_search_engines/search_engines_config.py +54 -0
- local_deep_research-0.1.0.dist-info/LICENSE +21 -0
- local_deep_research-0.1.0.dist-info/METADATA +328 -0
- local_deep_research-0.1.0.dist-info/RECORD +56 -0
- local_deep_research-0.1.0.dist-info/WHEEL +5 -0
- local_deep_research-0.1.0.dist-info/entry_points.txt +3 -0
- local_deep_research-0.1.0.dist-info/top_level.txt +1 -0
local_deep_research/web_search_engines/engines/search_engine_wikipedia.py
@@ -0,0 +1,242 @@
import wikipedia
from typing import Dict, List, Any, Optional
from langchain_core.language_models import BaseLLM
import logging

from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
from local_deep_research import config

# Setup logging
logger = logging.getLogger(__name__)


class WikipediaSearchEngine(BaseSearchEngine):
    """Wikipedia search engine implementation with two-phase approach"""

    def __init__(self,
                 max_results: int = 10,
                 language: str = "en",
                 include_content: bool = True,
                 sentences: int = 5,
                 llm: Optional[BaseLLM] = None,
                 max_filtered_results: Optional[int] = None,
                 **kwargs):
        """
        Initialize the Wikipedia search engine.

        Args:
            max_results: Maximum number of search results
            language: Language code for Wikipedia (e.g., 'en', 'fr', 'es')
            include_content: Whether to include full page content in results
            sentences: Number of sentences to include in summary
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Initialize the BaseSearchEngine with the LLM and max_filtered_results
        super().__init__(llm=llm, max_filtered_results=max_filtered_results)

        self.max_results = max_results
        self.include_content = include_content
        self.sentences = sentences

        # Set the Wikipedia language
        wikipedia.set_lang(language)

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information (titles and summaries) for Wikipedia pages.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting Wikipedia page previews for query: {query}")

        try:
            # Get search results (just titles)
            search_results = wikipedia.search(query, results=self.max_results)

            logger.info(f"Found {len(search_results)} Wikipedia results: {search_results}")

            if not search_results:
                logger.info(f"No Wikipedia results found for query: {query}")
                return []

            # Create a cache for full pages (will be populated on-demand)
            self._page_cache = {}

            # Generate previews with summaries
            previews = []
            for title in search_results:
                try:
                    # Get just the summary, with auto_suggest=False to be more precise
                    summary = None
                    try:
                        summary = wikipedia.summary(title, sentences=self.sentences, auto_suggest=False)
                    except wikipedia.exceptions.DisambiguationError as e:
                        # If disambiguation error, try the first option
                        if e.options and len(e.options) > 0:
                            logger.info(f"Disambiguation for '{title}', trying first option: {e.options[0]}")
                            try:
                                summary = wikipedia.summary(e.options[0], sentences=self.sentences, auto_suggest=False)
                                title = e.options[0]  # Use the new title
                            except Exception as inner_e:
                                logger.error(f"Error with disambiguation option: {inner_e}")
                                continue
                        else:
                            logger.warning(f"Disambiguation with no options for '{title}'")
                            continue

                    if summary:
                        preview = {
                            "id": title,  # Use title as ID
                            "title": title,
                            "snippet": summary,
                            "link": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
                        }

                        previews.append(preview)

                except (wikipedia.exceptions.PageError,
                        wikipedia.exceptions.WikipediaException) as e:
                    # Skip pages with errors
                    logger.warning(f"Error getting summary for '{title}': {e}")
                    continue
                except Exception as e:
                    logger.error(f"Unexpected error for '{title}': {e}")
                    continue

            logger.info(f"Successfully created {len(previews)} previews from Wikipedia")
            return previews

        except Exception as e:
            logger.error(f"Error getting Wikipedia previews: {e}")
            return []

    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Wikipedia pages.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should add full content
        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info(f"Getting full content for {len(relevant_items)} relevant Wikipedia pages")

        results = []
        for item in relevant_items:
            title = item.get("id")  # Title stored as ID

            if not title:
                results.append(item)
                continue

            try:
                # Get the full page
                page = wikipedia.page(title, auto_suggest=False)

                # Create a full result with all information
                result = {
                    "title": page.title,
                    "link": page.url,
                    "snippet": item.get("snippet", "")  # Keep existing snippet
                }

                # Add additional information
                result["content"] = page.content
                result["full_content"] = page.content
                result["categories"] = page.categories
                result["references"] = page.references
                result["links"] = page.links
                result["images"] = page.images
                result["sections"] = page.sections

                results.append(result)

            except (wikipedia.exceptions.DisambiguationError,
                    wikipedia.exceptions.PageError,
                    wikipedia.exceptions.WikipediaException) as e:
                # If error, use the preview
                logger.warning(f"Error getting full content for '{title}': {e}")
                results.append(item)
            except Exception as e:
                logger.error(f"Unexpected error getting full content for '{title}': {e}")
                results.append(item)

        return results

    def get_summary(self, title: str, sentences: Optional[int] = None) -> str:
        """
        Get a summary of a specific Wikipedia page.

        Args:
            title: Title of the Wikipedia page
            sentences: Number of sentences to include (defaults to self.sentences)

        Returns:
            Summary of the page
        """
        sentences = sentences or self.sentences
        try:
            return wikipedia.summary(title, sentences=sentences, auto_suggest=False)
        except wikipedia.exceptions.DisambiguationError as e:
            if e.options and len(e.options) > 0:
                return wikipedia.summary(e.options[0], sentences=sentences, auto_suggest=False)
            raise

    def get_page(self, title: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific Wikipedia page.

        Args:
            title: Title of the Wikipedia page

        Returns:
            Dictionary with page information
        """
        # Check if we should include full content
        include_content = not (hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY)

        try:
            page = wikipedia.page(title, auto_suggest=False)

            result = {
                "title": page.title,
                "link": page.url,
                "snippet": self.get_summary(title, self.sentences)
            }

            # Add additional information if requested
            if include_content:
                result["content"] = page.content
                result["full_content"] = page.content
                result["categories"] = page.categories
                result["references"] = page.references
                result["links"] = page.links
                result["images"] = page.images
                result["sections"] = page.sections

            return result
        except wikipedia.exceptions.DisambiguationError as e:
            if e.options and len(e.options) > 0:
                return self.get_page(e.options[0])
            raise

    def set_language(self, language: str) -> None:
        """
        Change the Wikipedia language.

        Args:
            language: Language code (e.g., 'en', 'fr', 'es')
        """
        wikipedia.set_lang(language)
local_deep_research/web_search_engines/full_search.py
@@ -0,0 +1,254 @@
import justext
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_core.language_models import BaseLLM
from typing import List, Dict, Any, Optional, Union
import json
import os
from .utilties.search_utilities import remove_think_tags
from datetime import datetime
from local_deep_research import config

class FullSearchResults:
    """
    Enhanced web content retrieval class that works with the BaseSearchEngine architecture.
    Can be used as a wrapper around web-based search engines like DuckDuckGo and SerpAPI.
    """

    def __init__(
        self,
        llm: BaseLLM,
        web_search,
        output_format: str = "list",
        language: str = "English",
        max_results: int = 10,
        region: str = "wt-wt",
        time: str = "y",
        safesearch: str = "Moderate"
    ):
        """
        Initialize the full search results processor.

        Args:
            llm: Language model instance for relevance filtering
            web_search: Web search engine instance that provides initial results
            output_format: Format of output ('list' or other formats)
            language: Language for content processing
            max_results: Maximum number of search results
            region: Search region
            time: Time period for search results
            safesearch: Safe search setting
        """
        self.llm = llm
        self.output_format = output_format
        self.language = language
        self.max_results = max_results
        self.region = region
        self.time = time
        self.safesearch = safesearch
        self.web_search = web_search
        os.environ["USER_AGENT"] = "Local Deep Research/1.0"

        self.bs_transformer = BeautifulSoupTransformer()
        self.tags_to_extract = ["p", "div", "span"]

    def run(self, query: str) -> List[Dict[str, Any]]:
        """
        Legacy method that performs a full search in one step.
        Respects config parameters:
        - SEARCH_SNIPPETS_ONLY: If True, only returns snippets without full content
        - SKIP_RELEVANCE_FILTER: If True, returns all results without filtering

        Args:
            query: The search query

        Returns:
            List of search results with full content (unless SEARCH_SNIPPETS_ONLY is True)
        """
        # Phase 1: Get search results from the web search engine
        previews = self._get_previews(query)
        if not previews:
            return []

        # Phase 2: Filter URLs using LLM (unless SKIP_RELEVANCE_FILTER is True)
        if hasattr(config, 'SKIP_RELEVANCE_FILTER') and config.SKIP_RELEVANCE_FILTER:
            relevant_items = previews
            print("Skipping relevance filtering as per config")
        else:
            relevant_items = self._filter_relevant_items(previews, query)
            if not relevant_items:
                return []

        # Phase 3: Get full content for relevant items (unless SEARCH_SNIPPETS_ONLY is True)
        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
            print("Returning snippet-only results as per config")
            return relevant_items
        else:
            results = self._get_full_content(relevant_items)
            return results

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from the web search engine.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        try:
            # Get search results from the web search engine
            search_results = self.web_search.invoke(query)

            if not isinstance(search_results, list):
                print("Error: Expected search results in list format")
                return []

            # Return the results as previews
            return search_results

        except Exception as e:
            print(f"Error getting previews: {e}")
            return []

    def _filter_relevant_items(self, previews: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
        """
        Filter previews for relevance using LLM.

        Args:
            previews: List of preview dictionaries
            query: The original search query

        Returns:
            List of relevant preview dictionaries
        """
        # Skip filtering if disabled in config or no previews
        if not config.QUALITY_CHECK_DDG_URLS or not previews:
            return previews

        # Format for LLM evaluation
        now = datetime.now()
        current_time = now.strftime("%Y-%m-%d")
        prompt = f"""ONLY Return a JSON array. The response contains no letters. Evaluate these URLs for:
1. Timeliness (today: {current_time})
2. Factual accuracy (cross-reference major claims)
3. Source reliability (prefer official company websites, established news outlets)
4. Direct relevance to query: {query}

URLs to evaluate:
{json.dumps(previews, indent=2)}

Return a JSON array of indices (0-based) for sources that meet ALL criteria.
ONLY Return a JSON array of indices (0-based) and nothing else. No letters.
Example response: \n[0, 2, 4]\n\n"""

        try:
            # Get LLM's evaluation
            response = self.llm.invoke(prompt)

            # Extract JSON array from response
            response_text = remove_think_tags(response.content)
            # Clean up response to handle potential formatting issues
            response_text = response_text.strip()

            # Find the first occurrence of '[' and the last occurrence of ']'
            start_idx = response_text.find('[')
            end_idx = response_text.rfind(']')

            if start_idx >= 0 and end_idx > start_idx:
                array_text = response_text[start_idx:end_idx+1]
                good_indices = json.loads(array_text)

                # Return only the results with good indices
                return [r for i, r in enumerate(previews) if i in good_indices]
            else:
                print("Could not find JSON array in response, returning all previews")
                return previews

        except Exception as e:
            print(f"URL filtering error: {e}")
            # Fall back to returning all previews on error
            return previews

    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant items by retrieving and processing web pages.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        nr_full_text = 0

        # Extract URLs from relevant items
        urls = [item.get("link") for item in relevant_items if item.get("link")]

        if not urls:
            print("\n === NO VALID LINKS ===\n")
            return relevant_items

        try:
            # Download the full HTML pages for filtered URLs
            loader = AsyncChromiumLoader(urls)
            html_docs = loader.load()

            # Process the HTML using BeautifulSoupTransformer
            full_docs = self.bs_transformer.transform_documents(
                html_docs, tags_to_extract=self.tags_to_extract
            )

            # Remove boilerplate from each document
            url_to_content = {}
            for doc in full_docs:
                nr_full_text += 1
                source = doc.metadata.get("source")
                if source:
                    cleaned_text = self._remove_boilerplate(doc.page_content)
                    url_to_content[source] = cleaned_text

            # Attach the cleaned full content to each result
            results = []
            for item in relevant_items:
                new_item = item.copy()
                link = item.get("link")
                new_item["full_content"] = url_to_content.get(link, None)
                results.append(new_item)

            print(f"FULL SEARCH WITH FILTERED URLS - Full text retrieved: {nr_full_text}")
            return results

        except Exception as e:
            print(f"Error retrieving full content: {e}")
            # Return original items if full content retrieval fails
            return relevant_items

    def _remove_boilerplate(self, html: str) -> str:
        """
        Remove boilerplate content from HTML.

        Args:
            html: HTML content

        Returns:
            Cleaned text content
        """
        if not html or not html.strip():
            return ""
        try:
            paragraphs = justext.justext(html, justext.get_stoplist(self.language))
            cleaned = "\n".join([p.text for p in paragraphs if not p.is_boilerplate])
            return cleaned
        except Exception as e:
            print(f"Error removing boilerplate: {e}")
            return html

    def invoke(self, query: str) -> List[Dict[str, Any]]:
        """Compatibility method for LangChain tools"""
        return self.run(query)

    def __call__(self, query: str) -> List[Dict[str, Any]]:
        """Make the class callable like a function"""
        return self.invoke(query)
local_deep_research/web_search_engines/search_engine_base.py
@@ -0,0 +1,197 @@
from abc import ABC, abstractmethod
from typing import Dict, List, Any, Optional
from langchain_core.language_models import BaseLLM
from datetime import datetime
import json
from local_deep_research.utilties.search_utilities import remove_think_tags

import logging
logger = logging.getLogger(__name__)

class BaseSearchEngine(ABC):
    """
    Abstract base class for search engines with two-phase retrieval capability.
    Handles common parameters and implements the two-phase search approach.
    """

    def __init__(self,
                 llm: Optional[BaseLLM] = None,
                 max_filtered_results: Optional[int] = 5,
                 **kwargs):
        """
        Initialize the search engine with common parameters.

        Args:
            llm: Optional language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            **kwargs: Additional engine-specific parameters
        """
        if max_filtered_results == None: max_filtered_results=5
        self.llm = llm  # LLM for relevance filtering
        self.max_filtered_results = max_filtered_results  # Limit filtered results

    def run(self, query: str) -> List[Dict[str, Any]]:
        """
        Run the search engine with a given query, retrieving and filtering results.
        This implements a two-phase retrieval approach:
        1. Get preview information for many results
        2. Filter the previews for relevance
        3. Get full content for only the relevant results

        Args:
            query: The search query

        Returns:
            List of search results with full content (if available)
        """
        # Ensure we're measuring time correctly for citation tracking
        start_time = datetime.now()

        # Step 1: Get preview information for items
        previews = self._get_previews(query)
        if not previews:
            logger.info(f"Search engine {self.__class__.__name__} returned no preview results for query: {query}")
            return []

        # Step 2: Filter previews for relevance with LLM
        filtered_items = self._filter_for_relevance(previews, query)
        if not filtered_items:
            logger.info(f"All preview results were filtered out as irrelevant for query: {query}")
            # Fall back to preview items if everything was filtered
            # Access config inside the method to avoid circular import
            from local_deep_research import config
            if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
                return previews[:self.max_filtered_results or 5]  # Return unfiltered but limited results
            else:
                filtered_items = previews[:self.max_filtered_results or 5]

        # Step 3: Get full content for filtered items
        # Import config inside the method to avoid circular import
        from local_deep_research import config
        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
            logger.info("Returning snippet-only results as per config")
            results = filtered_items
        else:
            results = self._get_full_content(filtered_items)

        return results

    def invoke(self, query: str) -> List[Dict[str, Any]]:
        """Compatibility method for LangChain tools"""
        return self.run(query)

    def _filter_for_relevance(self, previews: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
        """
        Filter search results for relevance to the query using an LLM.

        Checks config.SKIP_RELEVANCE_FILTER to determine whether to perform filtering.

        Args:
            previews: List of search result dictionaries with preview information
            query: The original search query

        Returns:
            Filtered list of the most relevant search results
        """
        # Import config inside the method to avoid circular import
        from local_deep_research import config

        # Skip filtering if configured to do so or if no LLM is available
        if hasattr(config, 'SKIP_RELEVANCE_FILTER') and config.SKIP_RELEVANCE_FILTER:
            # Return all previews up to max_filtered_results if no filtering is performed
            limit = self.max_filtered_results or 5
            return previews[:limit]

        # Default implementation uses LLM if available
        if not self.llm or not previews:
            # If no LLM available, return all previews as relevant
            if self.max_filtered_results and len(previews) > self.max_filtered_results:
                return previews[:self.max_filtered_results]
            return previews

        now = datetime.now()
        current_time = now.strftime("%Y-%m-%d")
        prompt = f"""Analyze these search results and provide a ranked list of the most relevant ones.

IMPORTANT: Evaluate and rank based on these criteria (in order of importance):
1. Timeliness - current/recent information as of {current_time}
2. Direct relevance to query: "{query}"
3. Source reliability (prefer official sources, established websites)
4. Factual accuracy (cross-reference major claims)

Search results to evaluate:
{json.dumps(previews, indent=2)}

Return ONLY a JSON array of indices (0-based) ranked from most to least relevant.
Include ONLY indices that meet ALL criteria, with the most relevant first.
Example response: [4, 0, 2]

Respond with ONLY the JSON array, no other text."""

        try:
            # Get LLM's evaluation
            response = self.llm.invoke(prompt)

            # Extract JSON array from response
            response_text = remove_think_tags(response.content)
            # Clean up response to handle potential formatting issues
            response_text = response_text.strip()

            # Find the first occurrence of '[' and the last occurrence of ']'
            start_idx = response_text.find('[')
            end_idx = response_text.rfind(']')

            if start_idx >= 0 and end_idx > start_idx:
                array_text = response_text[start_idx:end_idx+1]
                ranked_indices = json.loads(array_text)

                # Return the results in ranked order
                ranked_results = []
                for idx in ranked_indices:
                    if idx < len(previews):
                        ranked_results.append(previews[idx])

                # Limit to max_filtered_results if specified
                if self.max_filtered_results and len(ranked_results) > self.max_filtered_results:
                    logger.info(f"Limiting filtered results to top {self.max_filtered_results}")
                    return ranked_results[:self.max_filtered_results]

                return ranked_results
            else:
                logger.info("Could not find JSON array in response, returning all previews")
                if self.max_filtered_results and len(previews) > self.max_filtered_results:
                    return previews[:self.max_filtered_results]
                return previews

        except Exception as e:
            logger.info(f"Relevance filtering error: {e}")
            # Fall back to returning all previews (or top N) on error
            if self.max_filtered_results and len(previews) > self.max_filtered_results:
                return previews[:self.max_filtered_results]
            return previews

    @abstractmethod
    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information (titles, summaries) for initial search results.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries with at least 'id', 'title', and 'snippet' keys
        """
        pass

    @abstractmethod
    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant items.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        pass