local-deep-research 0.1.0 (local_deep_research-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +24 -0
- local_deep_research/citation_handler.py +113 -0
- local_deep_research/config.py +166 -0
- local_deep_research/defaults/__init__.py +44 -0
- local_deep_research/defaults/llm_config.py +269 -0
- local_deep_research/defaults/local_collections.toml +47 -0
- local_deep_research/defaults/main.toml +57 -0
- local_deep_research/defaults/search_engines.toml +244 -0
- local_deep_research/local_collections.py +141 -0
- local_deep_research/main.py +113 -0
- local_deep_research/report_generator.py +206 -0
- local_deep_research/search_system.py +241 -0
- local_deep_research/utilties/__init__.py +0 -0
- local_deep_research/utilties/enums.py +9 -0
- local_deep_research/utilties/llm_utils.py +116 -0
- local_deep_research/utilties/search_utilities.py +115 -0
- local_deep_research/utilties/setup_utils.py +6 -0
- local_deep_research/web/__init__.py +2 -0
- local_deep_research/web/app.py +1209 -0
- local_deep_research/web/static/css/styles.css +1008 -0
- local_deep_research/web/static/js/app.js +2078 -0
- local_deep_research/web/templates/api_keys_config.html +82 -0
- local_deep_research/web/templates/collections_config.html +90 -0
- local_deep_research/web/templates/index.html +312 -0
- local_deep_research/web/templates/llm_config.html +120 -0
- local_deep_research/web/templates/main_config.html +89 -0
- local_deep_research/web/templates/search_engines_config.html +154 -0
- local_deep_research/web/templates/settings.html +519 -0
- local_deep_research/web/templates/settings_dashboard.html +207 -0
- local_deep_research/web_search_engines/__init__.py +0 -0
- local_deep_research/web_search_engines/engines/__init__.py +0 -0
- local_deep_research/web_search_engines/engines/full_search.py +128 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
- local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
- local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
- local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
- local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
- local_deep_research/web_search_engines/full_search.py +254 -0
- local_deep_research/web_search_engines/search_engine_base.py +197 -0
- local_deep_research/web_search_engines/search_engine_factory.py +233 -0
- local_deep_research/web_search_engines/search_engines_config.py +54 -0
- local_deep_research-0.1.0.dist-info/LICENSE +21 -0
- local_deep_research-0.1.0.dist-info/METADATA +328 -0
- local_deep_research-0.1.0.dist-info/RECORD +56 -0
- local_deep_research-0.1.0.dist-info/WHEEL +5 -0
- local_deep_research-0.1.0.dist-info/entry_points.txt +3 -0
- local_deep_research-0.1.0.dist-info/top_level.txt +1 -0
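
Both diffs below add engines under local_deep_research/web_search_engines/engines/ that subclass BaseSearchEngine from search_engine_base.py (listed above but not shown in this diff). From the two implementations, the shared pattern is: the constructor accepts an optional llm used for relevance filtering, _get_previews(query) returns lightweight result dictionaries, _get_full_content(relevant_items) expands only the items kept after filtering, and run(query) returns the final list of results. A rough, hypothetical sketch of that pattern follows; the exact BaseSearchEngine contract is not part of this diff, so the class name and keyword arguments here are assumptions inferred from the code below.

from typing import Any, Dict, List

from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine


class StaticListSearchEngine(BaseSearchEngine):
    """Toy engine over an in-memory list; illustrates the two-phase preview/full-content pattern."""

    def __init__(self, documents: List[Dict[str, Any]], llm=None, max_filtered_results=None):
        # Both keyword arguments appear in the real engines' super().__init__ calls below.
        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
        self.documents = documents

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        # Phase 1: cheap previews (id/title/snippet/url) used for relevance filtering
        return [{k: d.get(k, "") for k in ("id", "title", "snippet", "url")} for d in self.documents]

    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        # Phase 2: attach full content only for the items kept after filtering
        by_id = {d["id"]: d for d in self.documents}
        return [by_id.get(item.get("id"), item) for item in relevant_items]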
local_deep_research/web_search_engines/engines/search_engine_google_pse.py
@@ -0,0 +1,283 @@
from typing import Dict, List, Any, Optional
import os
import requests
import time
import random
import logging
from requests.exceptions import RequestException
from urllib.parse import quote_plus
from langchain_core.language_models import BaseLLM

from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class GooglePSESearchEngine(BaseSearchEngine):
    """Google Programmable Search Engine implementation"""

    def __init__(self,
                 max_results: int = 10,
                 region: str = "us",
                 safe_search: bool = True,
                 search_language: str = "English",
                 api_key: Optional[str] = None,
                 search_engine_id: Optional[str] = None,
                 llm: Optional[BaseLLM] = None,
                 include_full_content: bool = False,
                 max_filtered_results: Optional[int] = None,
                 max_retries: int = 3,
                 retry_delay: float = 2.0,
                 **kwargs):
        """
        Initialize the Google Programmable Search Engine.

        Args:
            max_results: Maximum number of search results
            region: Region code for search results
            safe_search: Whether to enable safe search
            search_language: Language for search results
            api_key: Google API key (can also be set in GOOGLE_PSE_API_KEY env)
            search_engine_id: Google CSE ID (can also be set in GOOGLE_PSE_ENGINE_ID env)
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            max_retries: Maximum number of retry attempts for API requests
            retry_delay: Base delay in seconds between retry attempts
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Initialize the BaseSearchEngine with the LLM and max_filtered_results
        super().__init__(llm=llm, max_filtered_results=max_filtered_results)

        self.max_results = max_results
        self.include_full_content = include_full_content

        # Retry configuration
        self.max_retries = max_retries
        self.retry_delay = retry_delay

        # Rate limiting - keep track of last request time
        self.last_request_time = 0
        self.min_request_interval = 0.5  # Minimum time between requests in seconds

        # Language code mapping
        language_code_mapping = {
            "english": "en",
            "spanish": "es",
            "french": "fr",
            "german": "de",
            "italian": "it",
            "japanese": "ja",
            "korean": "ko",
            "portuguese": "pt",
            "russian": "ru",
            "chinese": "zh-CN"
        }

        # Get language code
        search_language = search_language.lower()
        self.language = language_code_mapping.get(search_language, "en")

        # Safe search setting
        self.safe = "active" if safe_search else "off"

        # Region/Country setting
        self.region = region

        # API key and Search Engine ID
        self.api_key = api_key or os.getenv("GOOGLE_PSE_API_KEY")
        self.search_engine_id = search_engine_id or os.getenv("GOOGLE_PSE_ENGINE_ID")

        if not self.api_key:
            raise ValueError("Google API key is required. Set it in the GOOGLE_PSE_API_KEY environment variable.")
        if not self.search_engine_id:
            raise ValueError("Google Search Engine ID is required. Set it in the GOOGLE_PSE_ENGINE_ID environment variable.")

        # Validate connection and credentials
        self._validate_connection()

    def _validate_connection(self):
        """Test the connection to ensure API key and Search Engine ID are valid"""
        try:
            # Make a minimal test query
            response = self._make_request("test")

            # Check if we got a valid response
            if response.get("error"):
                error_msg = response["error"].get("message", "Unknown error")
                raise ValueError(f"Google PSE API error: {error_msg}")

            # If we get here, the connection is valid
            logger.info("Google PSE connection validated successfully")
            return True

        except Exception as e:
            # Log the error and re-raise
            logger.error(f"Error validating Google PSE connection: {str(e)}")
            raise

    def _respect_rate_limit(self):
        """Ensure we don't exceed rate limits by adding appropriate delay between requests"""
        current_time = time.time()
        elapsed = current_time - self.last_request_time

        # If we've made a request recently, wait until the minimum interval has passed
        if elapsed < self.min_request_interval:
            sleep_time = self.min_request_interval - elapsed
            logger.debug(f"Rate limiting: sleeping for {sleep_time:.2f}s")
            time.sleep(sleep_time)

        # Update the last request time
        self.last_request_time = time.time()

    def _make_request(self, query: str, start_index: int = 1) -> Dict:
        """
        Make a request to the Google PSE API with retry logic and rate limiting

        Args:
            query: Search query string
            start_index: Starting index for pagination

        Returns:
            JSON response from the API

        Raises:
            RequestException: If all retry attempts fail
        """
        # Base URL for the API
        url = "https://www.googleapis.com/customsearch/v1"

        # Parameters for the request
        params = {
            "key": self.api_key,
            "cx": self.search_engine_id,
            "q": query,
            "num": min(10, self.max_results),  # Max 10 per request
            "start": start_index,
            "safe": self.safe,
            "lr": f"lang_{self.language}",
            "gl": self.region
        }

        # Implement retry logic with exponential backoff
        attempt = 0
        last_exception = None

        while attempt < self.max_retries:
            try:
                # Respect rate limits
                self._respect_rate_limit()

                # Add jitter to retries after the first attempt
                if attempt > 0:
                    jitter = random.uniform(0.5, 1.5)
                    sleep_time = self.retry_delay * (2 ** (attempt - 1)) * jitter
                    logger.info(f"Retry attempt {attempt+1}/{self.max_retries} for query '{query}'. Waiting {sleep_time:.2f}s")
                    time.sleep(sleep_time)

                # Make the request
                logger.debug(f"Making request to Google PSE API: {query} (start_index={start_index})")
                response = requests.get(url, params=params, timeout=10)

                # Check for HTTP errors
                response.raise_for_status()

                # Return the JSON response
                return response.json()

            except RequestException as e:
                logger.warning(f"Request error on attempt {attempt+1}/{self.max_retries}: {str(e)}")
                last_exception = e
            except Exception as e:
                logger.warning(f"Error on attempt {attempt+1}/{self.max_retries}: {str(e)}")
                last_exception = e

            attempt += 1

        # If we get here, all retries failed
        error_msg = f"Failed to get response from Google PSE API after {self.max_retries} attempts"
        logger.error(error_msg)

        if last_exception:
            raise RequestException(f"{error_msg}: {str(last_exception)}")
        else:
            raise RequestException(error_msg)

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """Get search result previews/snippets"""
        results = []

        # Google PSE API returns a maximum of 10 results per request
        # We may need to make multiple requests to get the desired number
        start_index = 1
        total_results = 0

        while total_results < self.max_results:
            try:
                response = self._make_request(query, start_index)

                # Break if no items
                if "items" not in response:
                    break

                items = response.get("items", [])

                # Process each result
                for item in items:
                    title = item.get("title", "")
                    snippet = item.get("snippet", "")
                    url = item.get("link", "")

                    # Skip results without URL
                    if not url:
                        continue

                    results.append({
                        "title": title,
                        "snippet": snippet,
                        "url": url,
                        "source": "Google Programmable Search"
                    })

                    total_results += 1
                    if total_results >= self.max_results:
                        break

                # Check if there are more results
                if not items or total_results >= self.max_results:
                    break

                # Update start index for next request
                start_index += len(items)

                # Add a small delay between multiple requests to be respectful of the API
                if total_results < self.max_results:
                    time.sleep(self.min_request_interval)

            except Exception as e:
                logger.error(f"Error getting search results: {str(e)}")
                break

        logger.info(f"Retrieved {len(results)} search results for query: '{query}'")
        return results

    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Get full content for search results"""
        # Use the BaseSearchEngine implementation
        return super()._get_full_content(relevant_items)

    def run(self, query: str) -> List[Dict[str, Any]]:
        """Run the search engine to get results for a query"""
        # Get search result previews/snippets
        search_results = self._get_previews(query)

        # Filter for relevance if we have an LLM and max_filtered_results
        if self.llm and self.max_filtered_results:
            search_results = self._filter_for_relevance(query, search_results)

        # Get full content if needed
        if self.include_full_content:
            search_results = self._get_full_content(search_results)

        return search_results
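
Based only on the constructor and run() method above, a typical usage sketch looks like the following. This is illustrative, not part of the package: the query string and parameter values are made up, and the credentials must be real for the constructor's validation request to succeed.

from local_deep_research.web_search_engines.engines.search_engine_google_pse import GooglePSESearchEngine

# Requires GOOGLE_PSE_API_KEY and GOOGLE_PSE_ENGINE_ID in the environment (or pass
# api_key=/search_engine_id= explicitly); __init__ issues a small "test" request via
# _validate_connection() and raises if the credentials are missing or rejected.
engine = GooglePSESearchEngine(max_results=5, search_language="German", safe_search=True)

# run() returns dicts with "title", "snippet", "url" and "source" keys, filtered for
# relevance only when an llm and max_filtered_results are supplied.
for result in engine.run("local retrieval augmented generation"):
    print(result["title"], "->", result["url"])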
local_deep_research/web_search_engines/engines/search_engine_guardian.py
@@ -0,0 +1,337 @@
import requests
from typing import Dict, List, Any, Optional
import os
from datetime import datetime, timedelta
from langchain_core.language_models import BaseLLM

from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
from local_deep_research import config


class GuardianSearchEngine(BaseSearchEngine):
    """The Guardian API search engine implementation"""

    def __init__(self,
                 max_results: int = 10,
                 api_key: Optional[str] = None,
                 from_date: Optional[str] = None,
                 to_date: Optional[str] = None,
                 section: Optional[str] = None,
                 order_by: str = "relevance",
                 llm: Optional[BaseLLM] = None):
        """
        Initialize The Guardian search engine.

        Args:
            max_results: Maximum number of search results
            api_key: The Guardian API key (can also be set in GUARDIAN_API_KEY env)
            from_date: Start date for search (YYYY-MM-DD format, default 1 month ago)
            to_date: End date for search (YYYY-MM-DD format, default today)
            section: Filter by section (e.g., "politics", "technology", "sport")
            order_by: Sort order ("relevance", "newest", "oldest")
            llm: Language model for relevance filtering
        """
        # Initialize the BaseSearchEngine with the LLM
        super().__init__(llm=llm)

        self.max_results = max_results
        self.api_key = api_key or os.getenv("GUARDIAN_API_KEY")

        if not self.api_key:
            raise ValueError("Guardian API key not found. Please provide api_key or set the GUARDIAN_API_KEY environment variable.")

        # Set date ranges if not provided
        if not from_date:
            # Default to one month ago
            one_month_ago = datetime.now() - timedelta(days=30)
            self.from_date = one_month_ago.strftime("%Y-%m-%d")
        else:
            self.from_date = from_date

        if not to_date:
            # Default to today
            self.to_date = datetime.now().strftime("%Y-%m-%d")
        else:
            self.to_date = to_date

        self.section = section
        self.order_by = order_by

        # API base URL
        self.api_url = "https://content.guardianapis.com/search"

    def _get_all_data(self, query: str) -> List[Dict[str, Any]]:
        """
        Get all article data from The Guardian API in a single call.
        Always requests all fields for simplicity.

        Args:
            query: The search query

        Returns:
            List of articles with all data
        """
        try:
            # Always request all fields for simplicity
            params = {
                "q": query,
                "api-key": self.api_key,
                "from-date": self.from_date,
                "to-date": self.to_date,
                "order-by": self.order_by,
                "page-size": min(self.max_results, 50),  # API maximum is 50
                "show-fields": "headline,trailText,byline,body,publication",
                "show-tags": "keyword"
            }

            # Add section filter if specified
            if self.section:
                params["section"] = self.section

            # Execute the API request
            response = requests.get(self.api_url, params=params)
            response.raise_for_status()

            data = response.json()

            # Extract results from the response
            articles = data.get("response", {}).get("results", [])

            # Format results to include all data
            formatted_articles = []
            for i, article in enumerate(articles):
                if i >= self.max_results:
                    break

                fields = article.get("fields", {})

                # Format the article with all fields
                result = {
                    "id": article.get("id", ""),
                    "title": fields.get("headline", article.get("webTitle", "")),
                    "link": article.get("webUrl", ""),
                    "snippet": fields.get("trailText", ""),
                    "publication_date": article.get("webPublicationDate", ""),
                    "section": article.get("sectionName", ""),
                    "author": fields.get("byline", ""),
                    "content": fields.get("body", ""),
                    "full_content": fields.get("body", "")
                }

                # Extract tags/keywords
                tags = article.get("tags", [])
                result["keywords"] = [tag.get("webTitle", "") for tag in tags if tag.get("type") == "keyword"]

                formatted_articles.append(result)

            return formatted_articles

        except Exception as e:
            print(f"Error getting data from The Guardian API: {e}")
            return []

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Guardian articles.
        Actually gets all data but returns only preview fields.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        print("Getting articles from The Guardian API")

        # Get all article data
        articles = self._get_all_data(query)

        # Store full articles for later use (implementation detail)
        self._full_articles = {a["id"]: a for a in articles}

        # Return only preview fields for each article
        previews = []
        for article in articles:
            preview = {
                "id": article["id"],
                "title": article["title"],
                "link": article["link"],
                "snippet": article["snippet"],
                "publication_date": article["publication_date"],
                "section": article["section"],
                "author": article["author"],
                "keywords": article.get("keywords", [])
            }
            previews.append(preview)

        return previews

    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Guardian articles.
        Restores full content from the cached data.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        print("Adding full content to relevant Guardian articles")

        # Check if we should add full content
        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
            return relevant_items

        # Get full articles for relevant items
        results = []
        for item in relevant_items:
            article_id = item.get("id", "")

            # Get the full article from our cache
            if hasattr(self, '_full_articles') and article_id in self._full_articles:
                results.append(self._full_articles[article_id])
            else:
                # If not found (shouldn't happen), just use the preview
                results.append(item)

        return results

    def run(self, query: str) -> List[Dict[str, Any]]:
        """
        Execute a search using The Guardian API with the two-phase approach.

        Args:
            query: The search query

        Returns:
            List of search results
        """
        print("---Execute a search using The Guardian---")

        # Use the implementation from the parent class which handles all phases
        results = super().run(query)

        # Clean up the cache after use
        if hasattr(self, '_full_articles'):
            del self._full_articles

        return results

    def get_article_by_id(self, article_id: str) -> Dict[str, Any]:
        """
        Get a specific article by its ID.

        Args:
            article_id: The Guardian article ID

        Returns:
            Dictionary with article information
        """
        try:
            # Guardian article API URL
            url = f"https://content.guardianapis.com/{article_id}"

            # Always request all fields
            response = requests.get(
                url,
                params={
                    "api-key": self.api_key,
                    "show-fields": "headline,trailText,body,byline,publication",
                    "show-tags": "keyword"
                }
            )
            response.raise_for_status()

            data = response.json()
            article = data.get("response", {}).get("content", {})

            if not article:
                return {}

            fields = article.get("fields", {})

            # Format the article with all fields
            result = {
                "id": article_id,
                "title": fields.get("headline", article.get("webTitle", "")),
                "link": article.get("webUrl", ""),
                "snippet": fields.get("trailText", ""),
                "publication_date": article.get("webPublicationDate", ""),
                "section": article.get("sectionName", ""),
                "author": fields.get("byline", "")
            }

            # Only include full content if not in snippet-only mode
            if not hasattr(config, 'SEARCH_SNIPPETS_ONLY') or not config.SEARCH_SNIPPETS_ONLY:
                result["content"] = fields.get("body", "")
                result["full_content"] = fields.get("body", "")

            # Extract tags/keywords
            tags = article.get("tags", [])
            result["keywords"] = [tag.get("webTitle", "") for tag in tags if tag.get("type") == "keyword"]

            return result

        except Exception as e:
            print(f"Error getting article details: {e}")
            return {}

    def search_by_section(self, section: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Search for articles in a specific section.

        Args:
            section: The Guardian section name (e.g., "politics", "technology")
            max_results: Maximum number of search results (defaults to self.max_results)

        Returns:
            List of articles in the section
        """
        original_section = self.section
        original_max_results = self.max_results

        try:
            # Set section and max_results for this search
            self.section = section
            if max_results:
                self.max_results = max_results

            # Use empty query to get all articles in the section
            return self.run("")

        finally:
            # Restore original values
            self.section = original_section
            self.max_results = original_max_results

    def get_recent_articles(self, days: int = 7, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Get recent articles from The Guardian.

        Args:
            days: Number of days to look back
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of recent articles
        """
        original_from_date = self.from_date
        original_order_by = self.order_by
        original_max_results = self.max_results

        try:
            # Set parameters for this search
            self.from_date = (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")
            self.order_by = "newest"
            if max_results:
                self.max_results = max_results

            # Use empty query to get all recent articles
            return self.run("")

        finally:
            # Restore original values
            self.from_date = original_from_date
            self.order_by = original_order_by
            self.max_results = original_max_results
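
A similarly hedged usage sketch for the Guardian engine, derived only from the methods above; the query strings and parameter values are illustrative, and a valid key is needed for any request to succeed.

from local_deep_research.web_search_engines.engines.search_engine_guardian import GuardianSearchEngine

# Requires GUARDIAN_API_KEY in the environment (or pass api_key= explicitly).
engine = GuardianSearchEngine(max_results=5, order_by="newest")

# Keyword search over the default window (roughly the last 30 days through today).
for article in engine.run("renewable energy"):
    print(article["publication_date"], article["title"])

# Convenience helpers added in this file: they temporarily override section,
# date range and ordering, call run(""), then restore the original settings.
tech_articles = engine.search_by_section("technology", max_results=3)
recent_articles = engine.get_recent_articles(days=3)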