local-deep-research 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/config.py +8 -8
- local_deep_research/defaults/search_engines.toml +39 -18
- local_deep_research/search_system.py +15 -9
- local_deep_research/utilties/enums.py +4 -4
- local_deep_research/web/app.py +3 -2
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +3 -5
- local_deep_research/web_search_engines/engines/search_engine_brave.py +3 -5
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +4 -3
- local_deep_research/web_search_engines/engines/search_engine_github.py +2 -4
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +2 -4
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +323 -78
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +3 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +3 -4
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +3 -2
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +1128 -0
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +2 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +2 -4
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +2 -4
- local_deep_research/web_search_engines/search_engine_base.py +12 -4
- {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/METADATA +1 -1
- {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/RECORD +25 -25
- local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +0 -623
- {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/WHEEL +0 -0
- {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/licenses/LICENSE +0 -0
- {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,20 @@
|
|
1
1
|
import requests
|
2
|
-
|
2
|
+
import logging
|
3
|
+
from typing import Dict, List, Any, Optional, Tuple
|
3
4
|
import os
|
4
5
|
from datetime import datetime, timedelta
|
5
6
|
from langchain_core.language_models import BaseLLM
|
6
7
|
|
7
8
|
from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
|
8
9
|
from local_deep_research import config
|
10
|
+
from local_deep_research.utilties.search_utilities import remove_think_tags
|
9
11
|
|
12
|
+
# Setup logging
|
13
|
+
logging.basicConfig(level=logging.INFO)
|
14
|
+
logger = logging.getLogger(__name__)
|
10
15
|
|
11
16
|
class GuardianSearchEngine(BaseSearchEngine):
|
12
|
-
"""
|
17
|
+
"""Enhanced Guardian API search engine implementation with LLM query optimization"""
|
13
18
|
|
14
19
|
def __init__(self,
|
15
20
|
max_results: int = 10,
|
@@ -18,9 +23,12 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
18
23
|
to_date: Optional[str] = None,
|
19
24
|
section: Optional[str] = None,
|
20
25
|
order_by: str = "relevance",
|
21
|
-
llm: Optional[BaseLLM] = None
|
26
|
+
llm: Optional[BaseLLM] = None,
|
27
|
+
max_filtered_results: Optional[int] = None,
|
28
|
+
optimize_queries: bool = True,
|
29
|
+
adaptive_search: bool = True):
|
22
30
|
"""
|
23
|
-
Initialize The Guardian search engine.
|
31
|
+
Initialize The Guardian search engine with enhanced features.
|
24
32
|
|
25
33
|
Args:
|
26
34
|
max_results: Maximum number of search results
|
@@ -29,13 +37,16 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
29
37
|
to_date: End date for search (YYYY-MM-DD format, default today)
|
30
38
|
section: Filter by section (e.g., "politics", "technology", "sport")
|
31
39
|
order_by: Sort order ("relevance", "newest", "oldest")
|
32
|
-
llm: Language model for relevance filtering
|
40
|
+
llm: Language model for relevance filtering and query optimization
|
41
|
+
max_filtered_results: Maximum number of results to keep after filtering
|
42
|
+
optimize_queries: Whether to optimize queries using LLM
|
43
|
+
adaptive_search: Whether to use adaptive search (adjusting date ranges)
|
33
44
|
"""
|
34
|
-
# Initialize the BaseSearchEngine with
|
35
|
-
super().__init__(llm=llm)
|
36
|
-
|
37
|
-
self.max_results = max_results
|
45
|
+
# Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
|
46
|
+
super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
|
38
47
|
self.api_key = api_key or os.getenv("GUARDIAN_API_KEY")
|
48
|
+
self.optimize_queries = optimize_queries
|
49
|
+
self.adaptive_search = adaptive_search
|
39
50
|
|
40
51
|
if not self.api_key:
|
41
52
|
raise ValueError("Guardian API key not found. Please provide api_key or set the GUARDIAN_API_KEY environment variable.")
|
@@ -56,10 +67,203 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
56
67
|
|
57
68
|
self.section = section
|
58
69
|
self.order_by = order_by
|
70
|
+
self._original_date_params = {
|
71
|
+
"from_date": self.from_date,
|
72
|
+
"to_date": self.to_date
|
73
|
+
}
|
59
74
|
|
60
75
|
# API base URL
|
61
76
|
self.api_url = "https://content.guardianapis.com/search"
|
62
77
|
|
78
|
+
def _optimize_query_for_guardian(self, query: str) -> str:
    """
    Optimize a natural language query for Guardian search.
    Uses LLM to transform questions into effective news search queries.

    Queries longer than 150 characters are first cut down to their first
    ten whitespace-separated words. If no LLM is configured, optimization is
    disabled, the LLM call fails, or the LLM produces no usable text, the
    (possibly truncated) input query is returned unchanged.

    Args:
        query: Natural language query

    Returns:
        Optimized query string for Guardian
    """
    # Handle extremely long queries by truncating first
    if len(query) > 150:
        simple_query = " ".join(query.split()[:10])
        logger.info(f"Query too long ({len(query)} chars), truncating to: {simple_query}")
        query = simple_query

    if not self.llm or not self.optimize_queries:
        # Return original query if no LLM available or optimization disabled
        return query

    try:
        # Prompt for query optimization
        prompt = f"""Transform this natural language question into a very short Guardian news search query.

Original query: "{query}"

CRITICAL RULES:
1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
2. Keep it EXTREMELY BRIEF - MAXIMUM 3-4 words total
3. Focus only on the main topic/person/event
4. Include proper names when relevant
5. Remove ALL unnecessary words
6. DO NOT use Boolean operators (no AND/OR)
7. DO NOT use quotes

EXAMPLE CONVERSIONS:
✓ "What's the impact of rising interest rates on UK housing market?" → "UK housing rates"
✓ "Latest developments in the Ukraine-Russia peace negotiations" → "Ukraine Russia negotiations"
✓ "How are tech companies responding to AI regulation?" → "tech AI regulation"
✓ "What is Donald Trump's current political activity?" → "Trump political activity"

Return ONLY the extremely brief search query.
"""

        # Get response from LLM
        response = self.llm.invoke(prompt)

        # Chat models answer with a message object exposing `.content`, while
        # plain text-completion LLMs answer with a bare string. Accept both
        # instead of failing on `.content` and silently skipping optimization.
        raw_text = getattr(response, "content", response)
        if not isinstance(raw_text, str):
            raw_text = str(raw_text)
        optimized_query = remove_think_tags(raw_text).strip()

        # Clean up the query - keep the first non-empty line that does not
        # look like an explanation; if there is none, keep the text as is
        boilerplate = ('here', 'i would', 'the best', 'this query')
        candidates = (line.strip() for line in optimized_query.split('\n'))
        optimized_query = next(
            (line for line in candidates
             if line and not line.lower().startswith(boilerplate)),
            optimized_query,
        )

        # Remove any quotes that wrap the entire query
        if optimized_query.startswith('"') and optimized_query.endswith('"') and optimized_query.count('"') == 2:
            optimized_query = optimized_query[1:-1]

        # An empty answer would make the search fall back to a generic
        # query downstream; the user's own wording is a better fallback
        if not optimized_query:
            logger.warning("LLM returned an empty query, using original query")
            return query

        logger.info(f"Original query: '{query}'")
        logger.info(f"Optimized for Guardian: '{optimized_query}'")

        return optimized_query

    except Exception as e:
        logger.error(f"Error optimizing query: {e}")
        return query  # Fall back to original query on error
|
147
|
+
|
148
|
+
def _adapt_dates_for_query_type(self, query: str) -> None:
    """
    Adapt date range based on query type (historical vs current).

    Does nothing when adaptive search is disabled. Queries of four words or
    fewer are treated as recent-news lookups (last 60 days, newest first)
    without consulting the LLM. Longer queries are classified by the LLM:
    "HISTORICAL" widens the start date to roughly ten years back, "CURRENT"
    narrows it to the last 60 days and sorts newest first, and any other
    answer leaves the originally configured dates in place.

    NOTE(review): this method also overwrites ``self.order_by``; only the
    date fields are recorded in ``_original_date_params``, so callers that
    need the original sort order must restore it themselves.

    Args:
        query: The search query
    """
    # Honour the adaptive_search switch before touching any search state;
    # previously the short-query shortcut below ran even when it was off
    if not self.adaptive_search:
        return

    # Fast path - for very short queries, default to recent news
    if len(query.split()) <= 4:
        logger.info("Short query detected, defaulting to recent news")
        # Default to 60 days for short queries
        recent = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
        self.from_date = recent
        self.order_by = "newest"
        return

    # Classification of longer queries needs an LLM
    if not self.llm:
        return

    try:
        prompt = f"""Is this query asking about HISTORICAL events or CURRENT events?

Query: "{query}"

ONE WORD ANSWER ONLY:
- "HISTORICAL" if about past events (older than 1 year)
- "CURRENT" if about recent events (within past year)
- "UNCLEAR" if can't determine

ONE WORD ONLY:"""

        response = self.llm.invoke(prompt)

        # Chat models return a message with `.content`; plain LLMs return str
        raw_text = getattr(response, "content", response)
        if not isinstance(raw_text, str):
            raw_text = str(raw_text)
        answer = remove_think_tags(raw_text).strip().upper()

        # Reset to original parameters first
        self.from_date = self._original_date_params["from_date"]
        self.to_date = self._original_date_params["to_date"]

        if "HISTORICAL" in answer:
            # For historical queries, go back 10 years
            logger.info("Query classified as HISTORICAL - extending search timeframe")
            ten_years_ago = (datetime.now() - timedelta(days=3650)).strftime("%Y-%m-%d")
            self.from_date = ten_years_ago

        elif "CURRENT" in answer:
            # For current events, focus on recent content
            logger.info("Query classified as CURRENT - focusing on recent content")
            recent = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
            self.from_date = recent
            self.order_by = "newest"  # Prioritize newest for current events

    except Exception as e:
        logger.error(f"Error adapting dates for query type: {e}")
        # Keep original date parameters on error
|
202
|
+
|
203
|
+
def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
    """
    Perform adaptive search that progressively adjusts parameters based on results.

    The query is first run with the current settings. If that yields fewer
    than three articles and adaptive search is enabled, up to three fallback
    strategies are tried in order, each only while fewer than three articles
    have been found:

    1. ``expanded_6mo``: start date moved back to six months ago.
    2. ``all_time_relevance``: start date moved back to 2000-01-01 and
       results ordered by relevance.
    3. ``no_section``: section filter dropped (only if one was set).

    A strategy's result replaces the current one only when it returns more
    articles. The start date is only ever moved earlier, never later, and a
    strategy whose settings match a request already made is skipped.
    ``from_date``, ``order_by`` and ``section`` are restored before
    returning, even if a search raises.

    Args:
        query: The search query

    Returns:
        Tuple of (list of articles, search strategy used)
    """
    # Try with current parameters
    articles = self._get_all_data(query)
    strategy = "initial"

    # If no results or too few, try different strategies
    if len(articles) < 3 and self.adaptive_search:
        logger.info(f"Initial search found only {len(articles)} results, trying alternative strategies")

        original_from_date = self.from_date
        original_order_by = self.order_by
        original_section = self.section

        # Parameter combinations already sent, so no request is repeated
        tried = {(original_from_date, original_order_by, original_section)}

        def earliest(candidate: str):
            """Return the earlier of the configured start date and candidate."""
            # ISO "YYYY-MM-DD" strings sort chronologically. A missing start
            # date is left alone rather than replaced by a later bound.
            # NOTE(review): assumes from_date is an ISO date string or None.
            if isinstance(original_from_date, str) and original_from_date:
                return min(original_from_date, candidate)
            return original_from_date

        def attempt(label: str) -> None:
            """Search with the current settings; keep the result if larger."""
            nonlocal articles, strategy
            key = (self.from_date, self.order_by, self.section)
            if key in tried:
                logger.info(f"Skipping strategy '{label}': identical to an earlier request")
                return
            tried.add(key)
            found = self._get_all_data(query)
            if len(found) > len(articles):
                articles = found
                strategy = label

        try:
            # Strategy 1: Expand to 6 months
            logger.info("Strategy 1: Expanding time range to 6 months")
            six_months_ago = (datetime.now() - timedelta(days=180)).strftime("%Y-%m-%d")
            self.from_date = earliest(six_months_ago)
            attempt("expanded_6mo")

            # Strategy 2: Expand to all time and try relevance order
            if len(articles) < 3:
                logger.info("Strategy 2: Expanding to all time with relevance ordering")
                self.from_date = earliest("2000-01-01")  # Effectively "all time"
                self.order_by = "relevance"
                attempt("all_time_relevance")

            # Strategy 3: Try removing section constraints (keeps the
            # widened date range and relevance ordering from strategy 2)
            if len(articles) < 3 and self.section:
                logger.info("Strategy 3: Removing section constraint")
                self.section = None
                attempt("no_section")
        finally:
            # Restore original settings even if a search raised
            self.from_date = original_from_date
            self.order_by = original_order_by
            self.section = original_section

    logger.info(f"Adaptive search using strategy '{strategy}' found {len(articles)} results")
    return articles, strategy
|
266
|
+
|
63
267
|
def _get_all_data(self, query: str) -> List[Dict[str, Any]]:
|
64
268
|
"""
|
65
269
|
Get all article data from The Guardian API in a single call.
|
@@ -72,14 +276,31 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
72
276
|
List of articles with all data
|
73
277
|
"""
|
74
278
|
try:
|
279
|
+
# Ensure query is not empty
|
280
|
+
if not query or query.strip() == "":
|
281
|
+
query = "news"
|
282
|
+
logger.warning("Empty query provided, using 'news' as default")
|
283
|
+
|
284
|
+
# Ensure query is not too long for API
|
285
|
+
if len(query) > 100:
|
286
|
+
logger.warning(f"Query too long for Guardian API ({len(query)} chars), truncating")
|
287
|
+
query = query[:100]
|
288
|
+
|
75
289
|
# Always request all fields for simplicity
|
290
|
+
# Ensure max_results is an integer to avoid comparison errors
|
291
|
+
page_size = min(int(self.max_results) if self.max_results is not None else 10, 50)
|
292
|
+
|
293
|
+
# Log full parameters for debugging
|
294
|
+
logger.info(f"Guardian API search query: '{query}'")
|
295
|
+
logger.info(f"Guardian API date range: {self.from_date} to {self.to_date}")
|
296
|
+
|
76
297
|
params = {
|
77
298
|
"q": query,
|
78
299
|
"api-key": self.api_key,
|
79
300
|
"from-date": self.from_date,
|
80
301
|
"to-date": self.to_date,
|
81
302
|
"order-by": self.order_by,
|
82
|
-
"page-size":
|
303
|
+
"page-size": page_size, # API maximum is 50
|
83
304
|
"show-fields": "headline,trailText,byline,body,publication",
|
84
305
|
"show-tags": "keyword"
|
85
306
|
}
|
@@ -88,6 +309,11 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
88
309
|
if self.section:
|
89
310
|
params["section"] = self.section
|
90
311
|
|
312
|
+
# Log the complete request parameters (except API key)
|
313
|
+
log_params = params.copy()
|
314
|
+
log_params["api-key"] = "REDACTED"
|
315
|
+
logger.info(f"Guardian API request parameters: {log_params}")
|
316
|
+
|
91
317
|
# Execute the API request
|
92
318
|
response = requests.get(self.api_url, params=params)
|
93
319
|
response.raise_for_status()
|
@@ -96,6 +322,7 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
96
322
|
|
97
323
|
# Extract results from the response
|
98
324
|
articles = data.get("response", {}).get("results", [])
|
325
|
+
logger.info(f"Guardian API returned {len(articles)} articles")
|
99
326
|
|
100
327
|
# Format results to include all data
|
101
328
|
formatted_articles = []
|
@@ -127,13 +354,12 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
127
354
|
return formatted_articles
|
128
355
|
|
129
356
|
except Exception as e:
|
130
|
-
|
357
|
+
logger.error(f"Error getting data from The Guardian API: {e}")
|
131
358
|
return []
|
132
359
|
|
133
360
|
def _get_previews(self, query: str) -> List[Dict[str, Any]]:
|
134
361
|
"""
|
135
|
-
Get preview information for Guardian articles.
|
136
|
-
Actually gets all data but returns only preview fields.
|
362
|
+
Get preview information for Guardian articles with enhanced optimization.
|
137
363
|
|
138
364
|
Args:
|
139
365
|
query: The search query
|
@@ -141,12 +367,29 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
141
367
|
Returns:
|
142
368
|
List of preview dictionaries
|
143
369
|
"""
|
144
|
-
|
370
|
+
logger.info(f"Getting articles from The Guardian API for query: {query}")
|
145
371
|
|
146
|
-
#
|
147
|
-
|
372
|
+
# Step 1: Optimize the query using LLM
|
373
|
+
optimized_query = self._optimize_query_for_guardian(query)
|
374
|
+
|
375
|
+
# Step 2: Adapt date parameters based on query type
|
376
|
+
self._adapt_dates_for_query_type(optimized_query)
|
377
|
+
|
378
|
+
# Step 3: Perform adaptive search
|
379
|
+
articles, strategy = self._adaptive_search(optimized_query)
|
148
380
|
|
149
|
-
# Store
|
381
|
+
# Store search metadata for debugging
|
382
|
+
self._search_metadata = {
|
383
|
+
"original_query": query,
|
384
|
+
"optimized_query": optimized_query,
|
385
|
+
"strategy": strategy,
|
386
|
+
"from_date": self.from_date,
|
387
|
+
"to_date": self.to_date,
|
388
|
+
"section": self.section,
|
389
|
+
"order_by": self.order_by
|
390
|
+
}
|
391
|
+
|
392
|
+
# Store full articles for later use
|
150
393
|
self._full_articles = {a["id"]: a for a in articles}
|
151
394
|
|
152
395
|
# Return only preview fields for each article
|
@@ -177,7 +420,7 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
177
420
|
Returns:
|
178
421
|
List of result dictionaries with full content
|
179
422
|
"""
|
180
|
-
|
423
|
+
logger.info(f"Adding full content to {len(relevant_items)} relevant Guardian articles")
|
181
424
|
|
182
425
|
# Check if we should add full content
|
183
426
|
if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
|
@@ -199,7 +442,7 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
199
442
|
|
200
443
|
def run(self, query: str) -> List[Dict[str, Any]]:
|
201
444
|
"""
|
202
|
-
Execute a search using The Guardian API with the
|
445
|
+
Execute a search using The Guardian API with the enhanced approach.
|
203
446
|
|
204
447
|
Args:
|
205
448
|
query: The search query
|
@@ -207,75 +450,77 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
207
450
|
Returns:
|
208
451
|
List of search results
|
209
452
|
"""
|
210
|
-
|
453
|
+
logger.info(f"---Execute a search using The Guardian (enhanced)---")
|
211
454
|
|
212
|
-
#
|
213
|
-
|
455
|
+
# Additional safety check for None query
|
456
|
+
if query is None:
|
457
|
+
logger.error("None query passed to Guardian search engine")
|
458
|
+
query = "news"
|
214
459
|
|
215
|
-
|
216
|
-
|
217
|
-
|
460
|
+
try:
|
461
|
+
# Get previews with our enhanced method
|
462
|
+
previews = self._get_previews(query)
|
218
463
|
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
464
|
+
# If no results, try one more time with a simplified query
|
465
|
+
if not previews:
|
466
|
+
simple_query = " ".join([w for w in query.split() if len(w) > 3][:3])
|
467
|
+
logger.warning(f"No Guardian articles found, trying simplified query: {simple_query}")
|
468
|
+
previews = self._get_previews(simple_query)
|
469
|
+
|
470
|
+
# If still no results, try with a very generic query as last resort
|
471
|
+
if not previews and "trump" in query.lower():
|
472
|
+
logger.warning("Trying last resort query: 'Donald Trump'")
|
473
|
+
previews = self._get_previews("Donald Trump")
|
474
|
+
elif not previews:
|
475
|
+
logger.warning("Trying last resort query: 'news'")
|
476
|
+
previews = self._get_previews("news")
|
227
477
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
# Guardian article API URL
|
233
|
-
url = f"https://content.guardianapis.com/{article_id}"
|
234
|
-
|
235
|
-
# Always request all fields
|
236
|
-
response = requests.get(
|
237
|
-
url,
|
238
|
-
params={
|
239
|
-
"api-key": self.api_key,
|
240
|
-
"show-fields": "headline,trailText,body,byline,publication",
|
241
|
-
"show-tags": "keyword"
|
242
|
-
}
|
243
|
-
)
|
244
|
-
response.raise_for_status()
|
478
|
+
# If still no results after all attempts, return empty list
|
479
|
+
if not previews:
|
480
|
+
logger.warning(f"No Guardian articles found after multiple attempts")
|
481
|
+
return []
|
245
482
|
|
246
|
-
|
247
|
-
|
483
|
+
# Filter for relevance if we have an LLM
|
484
|
+
if self.llm and hasattr(self, 'max_filtered_results') and self.max_filtered_results:
|
485
|
+
filtered_items = self._filter_for_relevance(previews, query)
|
486
|
+
if not filtered_items:
|
487
|
+
# Fall back to unfiltered results if everything was filtered out
|
488
|
+
logger.warning("All articles filtered out, using unfiltered results")
|
489
|
+
filtered_items = previews[:self.max_filtered_results]
|
490
|
+
else:
|
491
|
+
filtered_items = previews
|
248
492
|
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
fields = article.get("fields", {})
|
253
|
-
|
254
|
-
# Format the article with all fields
|
255
|
-
result = {
|
256
|
-
"id": article_id,
|
257
|
-
"title": fields.get("headline", article.get("webTitle", "")),
|
258
|
-
"link": article.get("webUrl", ""),
|
259
|
-
"snippet": fields.get("trailText", ""),
|
260
|
-
"publication_date": article.get("webPublicationDate", ""),
|
261
|
-
"section": article.get("sectionName", ""),
|
262
|
-
"author": fields.get("byline", "")
|
263
|
-
}
|
493
|
+
# Get full content for relevant items
|
494
|
+
results = self._get_full_content(filtered_items)
|
264
495
|
|
265
|
-
#
|
266
|
-
|
267
|
-
|
268
|
-
|
496
|
+
# Add source information to make it clear these are from The Guardian
|
497
|
+
for result in results:
|
498
|
+
if "source" not in result:
|
499
|
+
result["source"] = "The Guardian"
|
269
500
|
|
270
|
-
#
|
271
|
-
|
272
|
-
|
501
|
+
# Clean up the cache after use
|
502
|
+
if hasattr(self, '_full_articles'):
|
503
|
+
del self._full_articles
|
504
|
+
|
505
|
+
# Restore original date parameters
|
506
|
+
self.from_date = self._original_date_params["from_date"]
|
507
|
+
self.to_date = self._original_date_params["to_date"]
|
273
508
|
|
274
|
-
|
509
|
+
# Log search metadata if available
|
510
|
+
if hasattr(self, '_search_metadata'):
|
511
|
+
logger.info(f"Search metadata: {self._search_metadata}")
|
512
|
+
del self._search_metadata
|
513
|
+
|
514
|
+
return results
|
275
515
|
|
276
516
|
except Exception as e:
|
277
|
-
|
278
|
-
|
517
|
+
logger.error(f"Error in Guardian search: {e}")
|
518
|
+
|
519
|
+
# Restore original date parameters on error
|
520
|
+
self.from_date = self._original_date_params["from_date"]
|
521
|
+
self.to_date = self._original_date_params["to_date"]
|
522
|
+
|
523
|
+
return []
|
279
524
|
|
280
525
|
def search_by_section(self, section: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
|
281
526
|
"""
|
@@ -283,7 +528,7 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
283
528
|
|
284
529
|
Args:
|
285
530
|
section: The Guardian section name (e.g., "politics", "technology")
|
286
|
-
max_results: Maximum number of
|
531
|
+
max_results: Maximum number of results (defaults to self.max_results)
|
287
532
|
|
288
533
|
Returns:
|
289
534
|
List of articles in the section
|
@@ -35,11 +35,9 @@ class LocalAllSearchEngine(BaseSearchEngine):
|
|
35
35
|
max_filtered_results: Maximum results after filtering
|
36
36
|
**kwargs: Additional parameters passed to LocalSearchEngine instances
|
37
37
|
"""
|
38
|
-
# Initialize the
|
39
|
-
super().__init__(llm=llm, max_filtered_results=max_filtered_results)
|
40
|
-
|
41
|
-
self.max_results = max_results
|
42
|
-
|
38
|
+
# Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
|
39
|
+
super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
|
40
|
+
|
43
41
|
# Find all local collection search engines
|
44
42
|
self.local_engines = {}
|
45
43
|
try:
|
@@ -44,10 +44,9 @@ class PubMedSearchEngine(BaseSearchEngine):
|
|
44
44
|
max_filtered_results: Maximum number of results to keep after filtering
|
45
45
|
optimize_queries: Whether to optimize natural language queries for PubMed
|
46
46
|
"""
|
47
|
-
# Initialize the BaseSearchEngine with
|
48
|
-
super().__init__(llm=llm, max_filtered_results=max_filtered_results)
|
49
|
-
|
50
|
-
self.max_results = max_results
|
47
|
+
# Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
|
48
|
+
super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
|
49
|
+
self.max_results=max(self.max_results,25)
|
51
50
|
self.api_key = api_key
|
52
51
|
self.days_limit = days_limit
|
53
52
|
self.get_abstracts = get_abstracts
|
@@ -51,8 +51,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
|
|
51
51
|
include_full_content: Whether to include full webpage content in results
|
52
52
|
api_key: Alternative way to provide instance URL (takes precedence over instance_url)
|
53
53
|
"""
|
54
|
-
|
55
|
-
|
54
|
+
|
55
|
+
# Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
|
56
|
+
super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
|
56
57
|
|
57
58
|
# Get instance URL from various sources in priority order:
|
58
59
|
# 1. api_key parameter (which is actually the instance URL)
|