local-deep-research 0.1.0 (py3-none-any.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (56)
  1. local_deep_research/__init__.py +24 -0
  2. local_deep_research/citation_handler.py +113 -0
  3. local_deep_research/config.py +166 -0
  4. local_deep_research/defaults/__init__.py +44 -0
  5. local_deep_research/defaults/llm_config.py +269 -0
  6. local_deep_research/defaults/local_collections.toml +47 -0
  7. local_deep_research/defaults/main.toml +57 -0
  8. local_deep_research/defaults/search_engines.toml +244 -0
  9. local_deep_research/local_collections.py +141 -0
  10. local_deep_research/main.py +113 -0
  11. local_deep_research/report_generator.py +206 -0
  12. local_deep_research/search_system.py +241 -0
  13. local_deep_research/utilties/__init__.py +0 -0
  14. local_deep_research/utilties/enums.py +9 -0
  15. local_deep_research/utilties/llm_utils.py +116 -0
  16. local_deep_research/utilties/search_utilities.py +115 -0
  17. local_deep_research/utilties/setup_utils.py +6 -0
  18. local_deep_research/web/__init__.py +2 -0
  19. local_deep_research/web/app.py +1209 -0
  20. local_deep_research/web/static/css/styles.css +1008 -0
  21. local_deep_research/web/static/js/app.js +2078 -0
  22. local_deep_research/web/templates/api_keys_config.html +82 -0
  23. local_deep_research/web/templates/collections_config.html +90 -0
  24. local_deep_research/web/templates/index.html +312 -0
  25. local_deep_research/web/templates/llm_config.html +120 -0
  26. local_deep_research/web/templates/main_config.html +89 -0
  27. local_deep_research/web/templates/search_engines_config.html +154 -0
  28. local_deep_research/web/templates/settings.html +519 -0
  29. local_deep_research/web/templates/settings_dashboard.html +207 -0
  30. local_deep_research/web_search_engines/__init__.py +0 -0
  31. local_deep_research/web_search_engines/engines/__init__.py +0 -0
  32. local_deep_research/web_search_engines/engines/full_search.py +128 -0
  33. local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
  34. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
  35. local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
  36. local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
  37. local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
  38. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
  39. local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
  40. local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
  41. local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
  42. local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
  43. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
  44. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
  45. local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
  46. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
  47. local_deep_research/web_search_engines/full_search.py +254 -0
  48. local_deep_research/web_search_engines/search_engine_base.py +197 -0
  49. local_deep_research/web_search_engines/search_engine_factory.py +233 -0
  50. local_deep_research/web_search_engines/search_engines_config.py +54 -0
  51. local_deep_research-0.1.0.dist-info/LICENSE +21 -0
  52. local_deep_research-0.1.0.dist-info/METADATA +328 -0
  53. local_deep_research-0.1.0.dist-info/RECORD +56 -0
  54. local_deep_research-0.1.0.dist-info/WHEEL +5 -0
  55. local_deep_research-0.1.0.dist-info/entry_points.txt +3 -0
  56. local_deep_research-0.1.0.dist-info/top_level.txt +1 -0
local_deep_research/web_search_engines/engines/search_engine_wikipedia.py
@@ -0,0 +1,242 @@
+ import wikipedia
+ from typing import Dict, List, Any, Optional
+ from langchain_core.language_models import BaseLLM
+ import logging
+
+ from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
+ from local_deep_research import config
+
+ # Setup logging
+ logger = logging.getLogger(__name__)
+
+
+ class WikipediaSearchEngine(BaseSearchEngine):
+     """Wikipedia search engine implementation with two-phase approach"""
+
+     def __init__(self,
+                  max_results: int = 10,
+                  language: str = "en",
+                  include_content: bool = True,
+                  sentences: int = 5,
+                  llm: Optional[BaseLLM] = None,
+                  max_filtered_results: Optional[int] = None,
+                  **kwargs):
+         """
+         Initialize the Wikipedia search engine.
+
+         Args:
+             max_results: Maximum number of search results
+             language: Language code for Wikipedia (e.g., 'en', 'fr', 'es')
+             include_content: Whether to include full page content in results
+             sentences: Number of sentences to include in summary
+             llm: Language model for relevance filtering
+             max_filtered_results: Maximum number of results to keep after filtering
+             **kwargs: Additional parameters (ignored but accepted for compatibility)
+         """
+         # Initialize the BaseSearchEngine with the LLM and max_filtered_results
+         super().__init__(llm=llm, max_filtered_results=max_filtered_results)
+
+         self.max_results = max_results
+         self.include_content = include_content
+         self.sentences = sentences
+
+         # Set the Wikipedia language
+         wikipedia.set_lang(language)
+
+     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
+         """
+         Get preview information (titles and summaries) for Wikipedia pages.
+
+         Args:
+             query: The search query
+
+         Returns:
+             List of preview dictionaries
+         """
+         logger.info(f"Getting Wikipedia page previews for query: {query}")
+
+         try:
+             # Get search results (just titles)
+             search_results = wikipedia.search(query, results=self.max_results)
+
+             logger.info(f"Found {len(search_results)} Wikipedia results: {search_results}")
+
+             if not search_results:
+                 logger.info(f"No Wikipedia results found for query: {query}")
+                 return []
+
+             # Create a cache for full pages (will be populated on-demand)
+             self._page_cache = {}
+
+             # Generate previews with summaries
+             previews = []
+             for title in search_results:
+                 try:
+                     # Get just the summary, with auto_suggest=False to be more precise
+                     summary = None
+                     try:
+                         summary = wikipedia.summary(title, sentences=self.sentences, auto_suggest=False)
+                     except wikipedia.exceptions.DisambiguationError as e:
+                         # If disambiguation error, try the first option
+                         if e.options and len(e.options) > 0:
+                             logger.info(f"Disambiguation for '{title}', trying first option: {e.options[0]}")
+                             try:
+                                 summary = wikipedia.summary(e.options[0], sentences=self.sentences, auto_suggest=False)
+                                 title = e.options[0]  # Use the new title
+                             except Exception as inner_e:
+                                 logger.error(f"Error with disambiguation option: {inner_e}")
+                                 continue
+                         else:
+                             logger.warning(f"Disambiguation with no options for '{title}'")
+                             continue
+
+                     if summary:
+                         preview = {
+                             "id": title,  # Use title as ID
+                             "title": title,
+                             "snippet": summary,
+                             "link": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
+                         }
+
+                         previews.append(preview)
+
+                 except (wikipedia.exceptions.PageError,
+                         wikipedia.exceptions.WikipediaException) as e:
+                     # Skip pages with errors
+                     logger.warning(f"Error getting summary for '{title}': {e}")
+                     continue
+                 except Exception as e:
+                     logger.error(f"Unexpected error for '{title}': {e}")
+                     continue
+
+             logger.info(f"Successfully created {len(previews)} previews from Wikipedia")
+             return previews
+
+         except Exception as e:
+             logger.error(f"Error getting Wikipedia previews: {e}")
+             return []
+
+     def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """
+         Get full content for the relevant Wikipedia pages.
+
+         Args:
+             relevant_items: List of relevant preview dictionaries
+
+         Returns:
+             List of result dictionaries with full content
+         """
+         # Check if we should add full content
+         if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+             logger.info("Snippet-only mode, skipping full content retrieval")
+             return relevant_items
+
+         logger.info(f"Getting full content for {len(relevant_items)} relevant Wikipedia pages")
+
+         results = []
+         for item in relevant_items:
+             title = item.get("id")  # Title stored as ID
+
+             if not title:
+                 results.append(item)
+                 continue
+
+             try:
+                 # Get the full page
+                 page = wikipedia.page(title, auto_suggest=False)
+
+                 # Create a full result with all information
+                 result = {
+                     "title": page.title,
+                     "link": page.url,
+                     "snippet": item.get("snippet", "")  # Keep existing snippet
+                 }
+
+                 # Add additional information
+                 result["content"] = page.content
+                 result["full_content"] = page.content
+                 result["categories"] = page.categories
+                 result["references"] = page.references
+                 result["links"] = page.links
+                 result["images"] = page.images
+                 result["sections"] = page.sections
+
+                 results.append(result)
+
+             except (wikipedia.exceptions.DisambiguationError,
+                     wikipedia.exceptions.PageError,
+                     wikipedia.exceptions.WikipediaException) as e:
+                 # If error, use the preview
+                 logger.warning(f"Error getting full content for '{title}': {e}")
+                 results.append(item)
+             except Exception as e:
+                 logger.error(f"Unexpected error getting full content for '{title}': {e}")
+                 results.append(item)
+
+         return results
+
+     def get_summary(self, title: str, sentences: Optional[int] = None) -> str:
+         """
+         Get a summary of a specific Wikipedia page.
+
+         Args:
+             title: Title of the Wikipedia page
+             sentences: Number of sentences to include (defaults to self.sentences)
+
+         Returns:
+             Summary of the page
+         """
+         sentences = sentences or self.sentences
+         try:
+             return wikipedia.summary(title, sentences=sentences, auto_suggest=False)
+         except wikipedia.exceptions.DisambiguationError as e:
+             if e.options and len(e.options) > 0:
+                 return wikipedia.summary(e.options[0], sentences=sentences, auto_suggest=False)
+             raise
+
+     def get_page(self, title: str) -> Dict[str, Any]:
+         """
+         Get detailed information about a specific Wikipedia page.
+
+         Args:
+             title: Title of the Wikipedia page
+
+         Returns:
+             Dictionary with page information
+         """
+         # Check if we should include full content
+         include_content = not (hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY)
+
+         try:
+             page = wikipedia.page(title, auto_suggest=False)
+
+             result = {
+                 "title": page.title,
+                 "link": page.url,
+                 "snippet": self.get_summary(title, self.sentences)
+             }
+
+             # Add additional information if requested
+             if include_content:
+                 result["content"] = page.content
+                 result["full_content"] = page.content
+                 result["categories"] = page.categories
+                 result["references"] = page.references
+                 result["links"] = page.links
+                 result["images"] = page.images
+                 result["sections"] = page.sections
+
+             return result
+         except wikipedia.exceptions.DisambiguationError as e:
+             if e.options and len(e.options) > 0:
+                 return self.get_page(e.options[0])
+             raise
+
+     def set_language(self, language: str) -> None:
+         """
+         Change the Wikipedia language.
+
+         Args:
+             language: Language code (e.g., 'en', 'fr', 'es')
+         """
+         wikipedia.set_lang(language)
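
For orientation, a minimal usage sketch of the engine added above. It is not part of the diff: the direct instantiation, the example query, and passing llm=None (which makes the base class skip LLM relevance filtering and simply cap the previews) are assumptions; in the package the engine is normally constructed through search_engine_factory.py.

    # Hedged usage sketch (not part of the package) for WikipediaSearchEngine.
    from local_deep_research.web_search_engines.engines.search_engine_wikipedia import (
        WikipediaSearchEngine,
    )

    # llm=None: _filter_for_relevance falls back to returning the top previews.
    engine = WikipediaSearchEngine(max_results=5, sentences=3, llm=None)

    # run() performs the two-phase retrieval: previews first, then full pages
    # (unless config.SEARCH_SNIPPETS_ONLY is set).
    for result in engine.run("quantum error correction"):
        print(result["title"], "->", result["link"])
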
local_deep_research/web_search_engines/full_search.py
@@ -0,0 +1,254 @@
+ import justext
+ from langchain_community.document_loaders import AsyncChromiumLoader
+ from langchain_community.document_transformers import BeautifulSoupTransformer
+ from langchain_core.language_models import BaseLLM
+ from typing import List, Dict, Any, Optional, Union
+ import json
+ import os
+ from .utilties.search_utilities import remove_think_tags
+ from datetime import datetime
+ from local_deep_research import config
+
+ class FullSearchResults:
+     """
+     Enhanced web content retrieval class that works with the BaseSearchEngine architecture.
+     Can be used as a wrapper around web-based search engines like DuckDuckGo and SerpAPI.
+     """
+
+     def __init__(
+         self,
+         llm: BaseLLM,
+         web_search,
+         output_format: str = "list",
+         language: str = "English",
+         max_results: int = 10,
+         region: str = "wt-wt",
+         time: str = "y",
+         safesearch: str = "Moderate"
+     ):
+         """
+         Initialize the full search results processor.
+
+         Args:
+             llm: Language model instance for relevance filtering
+             web_search: Web search engine instance that provides initial results
+             output_format: Format of output ('list' or other formats)
+             language: Language for content processing
+             max_results: Maximum number of search results
+             region: Search region
+             time: Time period for search results
+             safesearch: Safe search setting
+         """
+         self.llm = llm
+         self.output_format = output_format
+         self.language = language
+         self.max_results = max_results
+         self.region = region
+         self.time = time
+         self.safesearch = safesearch
+         self.web_search = web_search
+         os.environ["USER_AGENT"] = "Local Deep Research/1.0"
+
+         self.bs_transformer = BeautifulSoupTransformer()
+         self.tags_to_extract = ["p", "div", "span"]
+
+     def run(self, query: str) -> List[Dict[str, Any]]:
+         """
+         Legacy method that performs a full search in one step.
+         Respects config parameters:
+         - SEARCH_SNIPPETS_ONLY: If True, only returns snippets without full content
+         - SKIP_RELEVANCE_FILTER: If True, returns all results without filtering
+
+         Args:
+             query: The search query
+
+         Returns:
+             List of search results with full content (unless SEARCH_SNIPPETS_ONLY is True)
+         """
+         # Phase 1: Get search results from the web search engine
+         previews = self._get_previews(query)
+         if not previews:
+             return []
+
+         # Phase 2: Filter URLs using LLM (unless SKIP_RELEVANCE_FILTER is True)
+         if hasattr(config, 'SKIP_RELEVANCE_FILTER') and config.SKIP_RELEVANCE_FILTER:
+             relevant_items = previews
+             print("Skipping relevance filtering as per config")
+         else:
+             relevant_items = self._filter_relevant_items(previews, query)
+             if not relevant_items:
+                 return []
+
+         # Phase 3: Get full content for relevant items (unless SEARCH_SNIPPETS_ONLY is True)
+         if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+             print("Returning snippet-only results as per config")
+             return relevant_items
+         else:
+             results = self._get_full_content(relevant_items)
+             return results
+
+     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
+         """
+         Get preview information from the web search engine.
+
+         Args:
+             query: The search query
+
+         Returns:
+             List of preview dictionaries
+         """
+         try:
+             # Get search results from the web search engine
+             search_results = self.web_search.invoke(query)
+
+             if not isinstance(search_results, list):
+                 print("Error: Expected search results in list format")
+                 return []
+
+             # Return the results as previews
+             return search_results
+
+         except Exception as e:
+             print(f"Error getting previews: {e}")
+             return []
+
+     def _filter_relevant_items(self, previews: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
+         """
+         Filter previews for relevance using LLM.
+
+         Args:
+             previews: List of preview dictionaries
+             query: The original search query
+
+         Returns:
+             List of relevant preview dictionaries
+         """
+         # Skip filtering if disabled in config or no previews
+         if not config.QUALITY_CHECK_DDG_URLS or not previews:
+             return previews
+
+         # Format for LLM evaluation
+         now = datetime.now()
+         current_time = now.strftime("%Y-%m-%d")
+         prompt = f"""ONLY Return a JSON array. The response contains no letters. Evaluate these URLs for:
+ 1. Timeliness (today: {current_time})
+ 2. Factual accuracy (cross-reference major claims)
+ 3. Source reliability (prefer official company websites, established news outlets)
+ 4. Direct relevance to query: {query}
+
+ URLs to evaluate:
+ {json.dumps(previews, indent=2)}
+
+ Return a JSON array of indices (0-based) for sources that meet ALL criteria.
+ ONLY Return a JSON array of indices (0-based) and nothing else. No letters.
+ Example response: \n[0, 2, 4]\n\n"""
+
+         try:
+             # Get LLM's evaluation
+             response = self.llm.invoke(prompt)
+
+             # Extract JSON array from response
+             response_text = remove_think_tags(response.content)
+             # Clean up response to handle potential formatting issues
+             response_text = response_text.strip()
+
+             # Find the first occurrence of '[' and the last occurrence of ']'
+             start_idx = response_text.find('[')
+             end_idx = response_text.rfind(']')
+
+             if start_idx >= 0 and end_idx > start_idx:
+                 array_text = response_text[start_idx:end_idx+1]
+                 good_indices = json.loads(array_text)
+
+                 # Return only the results with good indices
+                 return [r for i, r in enumerate(previews) if i in good_indices]
+             else:
+                 print("Could not find JSON array in response, returning all previews")
+                 return previews
+
+         except Exception as e:
+             print(f"URL filtering error: {e}")
+             # Fall back to returning all previews on error
+             return previews
+
+     def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """
+         Get full content for the relevant items by retrieving and processing web pages.
+
+         Args:
+             relevant_items: List of relevant preview dictionaries
+
+         Returns:
+             List of result dictionaries with full content
+         """
+         nr_full_text = 0
+
+         # Extract URLs from relevant items
+         urls = [item.get("link") for item in relevant_items if item.get("link")]
+
+         if not urls:
+             print("\n === NO VALID LINKS ===\n")
+             return relevant_items
+
+         try:
+             # Download the full HTML pages for filtered URLs
+             loader = AsyncChromiumLoader(urls)
+             html_docs = loader.load()
+
+             # Process the HTML using BeautifulSoupTransformer
+             full_docs = self.bs_transformer.transform_documents(
+                 html_docs, tags_to_extract=self.tags_to_extract
+             )
+
+             # Remove boilerplate from each document
+             url_to_content = {}
+             for doc in full_docs:
+                 nr_full_text += 1
+                 source = doc.metadata.get("source")
+                 if source:
+                     cleaned_text = self._remove_boilerplate(doc.page_content)
+                     url_to_content[source] = cleaned_text
+
+             # Attach the cleaned full content to each result
+             results = []
+             for item in relevant_items:
+                 new_item = item.copy()
+                 link = item.get("link")
+                 new_item["full_content"] = url_to_content.get(link, None)
+                 results.append(new_item)
+
+             print(f"FULL SEARCH WITH FILTERED URLS - Full text retrieved: {nr_full_text}")
+             return results
+
+         except Exception as e:
+             print(f"Error retrieving full content: {e}")
+             # Return original items if full content retrieval fails
+             return relevant_items
+
+     def _remove_boilerplate(self, html: str) -> str:
+         """
+         Remove boilerplate content from HTML.
+
+         Args:
+             html: HTML content
+
+         Returns:
+             Cleaned text content
+         """
+         if not html or not html.strip():
+             return ""
+         try:
+             paragraphs = justext.justext(html, justext.get_stoplist(self.language))
+             cleaned = "\n".join([p.text for p in paragraphs if not p.is_boilerplate])
+             return cleaned
+         except Exception as e:
+             print(f"Error removing boilerplate: {e}")
+             return html
+
+     def invoke(self, query: str) -> List[Dict[str, Any]]:
+         """Compatibility method for LangChain tools"""
+         return self.run(query)
+
+     def __call__(self, query: str) -> List[Dict[str, Any]]:
+         """Make the class callable like a function"""
+         return self.invoke(query)
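
The class above expects web_search to be any object whose invoke(query) returns a list of result dicts carrying "link" keys. A hedged wiring sketch follows; the stub engine and the ChatOllama model are assumptions (any chat model whose responses expose .content will do, since _filter_relevant_items reads response.content), the sketch assumes the module's own imports resolve in your environment, and behavior is governed by the config flags referenced in the code (QUALITY_CHECK_DDG_URLS, SKIP_RELEVANCE_FILTER, SEARCH_SNIPPETS_ONLY).

    # Hedged wiring sketch (not part of the package) for FullSearchResults.
    from langchain_community.chat_models import ChatOllama

    from local_deep_research.web_search_engines.full_search import FullSearchResults

    class StubSearch:
        """Stand-in for a web search engine such as DuckDuckGo or SerpAPI."""
        def invoke(self, query):
            return [{"title": "Example", "link": "https://example.org", "snippet": query}]

    llm = ChatOllama(model="mistral")  # assumption: any local chat model with .content responses
    full_search = FullSearchResults(llm=llm, web_search=StubSearch(), max_results=5)

    # Phase 1 previews, optional LLM URL filtering, then full-page retrieval
    # with AsyncChromiumLoader and justext boilerplate removal.
    results = full_search.run("open source deep research tools")
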
local_deep_research/web_search_engines/search_engine_base.py
@@ -0,0 +1,197 @@
+ from abc import ABC, abstractmethod
+ from typing import Dict, List, Any, Optional
+ from langchain_core.language_models import BaseLLM
+ from datetime import datetime
+ import json
+ from local_deep_research.utilties.search_utilities import remove_think_tags
+
+ import logging
+ logger = logging.getLogger(__name__)
+
+ class BaseSearchEngine(ABC):
+     """
+     Abstract base class for search engines with two-phase retrieval capability.
+     Handles common parameters and implements the two-phase search approach.
+     """
+
+     def __init__(self,
+                  llm: Optional[BaseLLM] = None,
+                  max_filtered_results: Optional[int] = 5,
+                  **kwargs):
+         """
+         Initialize the search engine with common parameters.
+
+         Args:
+             llm: Optional language model for relevance filtering
+             max_filtered_results: Maximum number of results to keep after filtering
+             **kwargs: Additional engine-specific parameters
+         """
+         if max_filtered_results == None: max_filtered_results=5
+         self.llm = llm  # LLM for relevance filtering
+         self.max_filtered_results = max_filtered_results  # Limit filtered results
+
+     def run(self, query: str) -> List[Dict[str, Any]]:
+         """
+         Run the search engine with a given query, retrieving and filtering results.
+         This implements a two-phase retrieval approach:
+         1. Get preview information for many results
+         2. Filter the previews for relevance
+         3. Get full content for only the relevant results
+
+         Args:
+             query: The search query
+
+         Returns:
+             List of search results with full content (if available)
+         """
+         # Ensure we're measuring time correctly for citation tracking
+         start_time = datetime.now()
+
+         # Step 1: Get preview information for items
+         previews = self._get_previews(query)
+         if not previews:
+             logger.info(f"Search engine {self.__class__.__name__} returned no preview results for query: {query}")
+             return []
+
+         # Step 2: Filter previews for relevance with LLM
+         filtered_items = self._filter_for_relevance(previews, query)
+         if not filtered_items:
+             logger.info(f"All preview results were filtered out as irrelevant for query: {query}")
+             # Fall back to preview items if everything was filtered
+             # Access config inside the method to avoid circular import
+             from local_deep_research import config
+             if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+                 return previews[:self.max_filtered_results or 5]  # Return unfiltered but limited results
+             else:
+                 filtered_items = previews[:self.max_filtered_results or 5]
+
+         # Step 3: Get full content for filtered items
+         # Import config inside the method to avoid circular import
+         from local_deep_research import config
+         if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+             logger.info("Returning snippet-only results as per config")
+             results = filtered_items
+         else:
+             results = self._get_full_content(filtered_items)
+
+         return results
+
+     def invoke(self, query: str) -> List[Dict[str, Any]]:
+         """Compatibility method for LangChain tools"""
+         return self.run(query)
+
+     def _filter_for_relevance(self, previews: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
+         """
+         Filter search results for relevance to the query using an LLM.
+
+         Checks config.SKIP_RELEVANCE_FILTER to determine whether to perform filtering.
+
+         Args:
+             previews: List of search result dictionaries with preview information
+             query: The original search query
+
+         Returns:
+             Filtered list of the most relevant search results
+         """
+         # Import config inside the method to avoid circular import
+         from local_deep_research import config
+
+         # Skip filtering if configured to do so or if no LLM is available
+         if hasattr(config, 'SKIP_RELEVANCE_FILTER') and config.SKIP_RELEVANCE_FILTER:
+             # Return all previews up to max_filtered_results if no filtering is performed
+             limit = self.max_filtered_results or 5
+             return previews[:limit]
+
+         # Default implementation uses LLM if available
+         if not self.llm or not previews:
+             # If no LLM available, return all previews as relevant
+             if self.max_filtered_results and len(previews) > self.max_filtered_results:
+                 return previews[:self.max_filtered_results]
+             return previews
+
+         now = datetime.now()
+         current_time = now.strftime("%Y-%m-%d")
+         prompt = f"""Analyze these search results and provide a ranked list of the most relevant ones.
+
+ IMPORTANT: Evaluate and rank based on these criteria (in order of importance):
+ 1. Timeliness - current/recent information as of {current_time}
+ 2. Direct relevance to query: "{query}"
+ 3. Source reliability (prefer official sources, established websites)
+ 4. Factual accuracy (cross-reference major claims)
+
+ Search results to evaluate:
+ {json.dumps(previews, indent=2)}
+
+ Return ONLY a JSON array of indices (0-based) ranked from most to least relevant.
+ Include ONLY indices that meet ALL criteria, with the most relevant first.
+ Example response: [4, 0, 2]
+
+ Respond with ONLY the JSON array, no other text."""
+
+         try:
+             # Get LLM's evaluation
+             response = self.llm.invoke(prompt)
+
+             # Extract JSON array from response
+             response_text = remove_think_tags(response.content)
+             # Clean up response to handle potential formatting issues
+             response_text = response_text.strip()
+
+             # Find the first occurrence of '[' and the last occurrence of ']'
+             start_idx = response_text.find('[')
+             end_idx = response_text.rfind(']')
+
+             if start_idx >= 0 and end_idx > start_idx:
+                 array_text = response_text[start_idx:end_idx+1]
+                 ranked_indices = json.loads(array_text)
+
+                 # Return the results in ranked order
+                 ranked_results = []
+                 for idx in ranked_indices:
+                     if idx < len(previews):
+                         ranked_results.append(previews[idx])
+
+                 # Limit to max_filtered_results if specified
+                 if self.max_filtered_results and len(ranked_results) > self.max_filtered_results:
+                     logger.info(f"Limiting filtered results to top {self.max_filtered_results}")
+                     return ranked_results[:self.max_filtered_results]
+
+                 return ranked_results
+             else:
+                 logger.info("Could not find JSON array in response, returning all previews")
+                 if self.max_filtered_results and len(previews) > self.max_filtered_results:
+                     return previews[:self.max_filtered_results]
+                 return previews
+
+         except Exception as e:
+             logger.info(f"Relevance filtering error: {e}")
+             # Fall back to returning all previews (or top N) on error
+             if self.max_filtered_results and len(previews) > self.max_filtered_results:
+                 return previews[:self.max_filtered_results]
+             return previews
+
+     @abstractmethod
+     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
+         """
+         Get preview information (titles, summaries) for initial search results.
+
+         Args:
+             query: The search query
+
+         Returns:
+             List of preview dictionaries with at least 'id', 'title', and 'snippet' keys
+         """
+         pass
+
+     @abstractmethod
+     def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """
+         Get full content for the relevant items.
+
+         Args:
+             relevant_items: List of relevant preview dictionaries
+
+         Returns:
+             List of result dictionaries with full content
+         """
+         pass
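
To make the two-phase contract above concrete, here is a minimal, hypothetical subclass that is not part of the package: it serves previews from an in-memory list and echoes the snippet back as full content, which is enough to exercise run() end to end, assuming the package is installed so that its config module is importable.

    # Hedged sketch (not part of the package): a minimal BaseSearchEngine subclass.
    from typing import Any, Dict, List

    from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine

    class StaticSearchEngine(BaseSearchEngine):
        def __init__(self, documents: List[Dict[str, Any]], **kwargs):
            super().__init__(**kwargs)
            self.documents = documents

        def _get_previews(self, query: str) -> List[Dict[str, Any]]:
            # Phase 1: cheap previews (id/title/snippet) for matching documents.
            return [
                {"id": str(i), "title": d["title"], "snippet": d["snippet"], "link": d.get("link", "")}
                for i, d in enumerate(self.documents)
                if query.lower() in (d["title"] + " " + d["snippet"]).lower()
            ]

        def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
            # Phase 3: "expensive" retrieval, here just the snippet again.
            return [dict(item, full_content=item["snippet"]) for item in relevant_items]

    docs = [{"title": "Two-phase retrieval", "snippet": "Preview first, fetch later."}]
    engine = StaticSearchEngine(docs, llm=None, max_filtered_results=3)
    print(engine.run("retrieval"))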