local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +154 -160
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +87 -45
- local_deep_research/search_system.py +153 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1583 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
- local_deep_research-0.2.2.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
@@ -1,35 +1,39 @@
|
|
1
|
-
import requests
|
2
1
|
import logging
|
3
|
-
from typing import Dict, List, Any, Optional, Tuple
|
4
2
|
import os
|
5
3
|
from datetime import datetime, timedelta
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
5
|
+
|
6
|
+
import requests
|
6
7
|
from langchain_core.language_models import BaseLLM
|
7
8
|
|
8
|
-
from
|
9
|
-
from
|
10
|
-
from
|
9
|
+
from ...config import search_config
|
10
|
+
from ...utilities.search_utilities import remove_think_tags
|
11
|
+
from ..search_engine_base import BaseSearchEngine
|
11
12
|
|
12
13
|
# Setup logging
|
13
14
|
logging.basicConfig(level=logging.INFO)
|
14
15
|
logger = logging.getLogger(__name__)
|
15
16
|
|
17
|
+
|
16
18
|
class GuardianSearchEngine(BaseSearchEngine):
|
17
19
|
"""Enhanced Guardian API search engine implementation with LLM query optimization"""
|
18
|
-
|
19
|
-
def __init__(
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
20
|
+
|
21
|
+
def __init__(
|
22
|
+
self,
|
23
|
+
max_results: int = 10,
|
24
|
+
api_key: Optional[str] = None,
|
25
|
+
from_date: Optional[str] = None,
|
26
|
+
to_date: Optional[str] = None,
|
27
|
+
section: Optional[str] = None,
|
28
|
+
order_by: str = "relevance",
|
29
|
+
llm: Optional[BaseLLM] = None,
|
30
|
+
max_filtered_results: Optional[int] = None,
|
31
|
+
optimize_queries: bool = True,
|
32
|
+
adaptive_search: bool = True,
|
33
|
+
):
|
30
34
|
"""
|
31
35
|
Initialize The Guardian search engine with enhanced features.
|
32
|
-
|
36
|
+
|
33
37
|
Args:
|
34
38
|
max_results: Maximum number of search results
|
35
39
|
api_key: The Guardian API key (can also be set in GUARDIAN_API_KEY env)
|
@@ -43,14 +47,18 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
43
47
|
adaptive_search: Whether to use adaptive search (adjusting date ranges)
|
44
48
|
"""
|
45
49
|
# Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
|
46
|
-
super().__init__(
|
50
|
+
super().__init__(
|
51
|
+
llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
|
52
|
+
)
|
47
53
|
self.api_key = api_key or os.getenv("GUARDIAN_API_KEY")
|
48
54
|
self.optimize_queries = optimize_queries
|
49
55
|
self.adaptive_search = adaptive_search
|
50
|
-
|
56
|
+
|
51
57
|
if not self.api_key:
|
52
|
-
raise ValueError(
|
53
|
-
|
58
|
+
raise ValueError(
|
59
|
+
"Guardian API key not found. Please provide api_key or set the GUARDIAN_API_KEY environment variable."
|
60
|
+
)
|
61
|
+
|
54
62
|
# Set date ranges if not provided
|
55
63
|
if not from_date:
|
56
64
|
# Default to one month ago
|
@@ -58,44 +66,46 @@ class GuardianSearchEngine(BaseSearchEngine):
|
|
58
66
|
self.from_date = one_month_ago.strftime("%Y-%m-%d")
|
59
67
|
else:
|
60
68
|
self.from_date = from_date
|
61
|
-
|
69
|
+
|
62
70
|
if not to_date:
|
63
71
|
# Default to today
|
64
72
|
self.to_date = datetime.now().strftime("%Y-%m-%d")
|
65
73
|
else:
|
66
74
|
self.to_date = to_date
|
67
|
-
|
75
|
+
|
68
76
|
self.section = section
|
69
77
|
self.order_by = order_by
|
70
78
|
self._original_date_params = {
|
71
79
|
"from_date": self.from_date,
|
72
|
-
"to_date": self.to_date
|
80
|
+
"to_date": self.to_date,
|
73
81
|
}
|
74
|
-
|
82
|
+
|
75
83
|
# API base URL
|
76
84
|
self.api_url = "https://content.guardianapis.com/search"
|
77
|
-
|
85
|
+
|
78
86
|
def _optimize_query_for_guardian(self, query: str) -> str:
|
79
87
|
"""
|
80
88
|
Optimize a natural language query for Guardian search.
|
81
89
|
Uses LLM to transform questions into effective news search queries.
|
82
|
-
|
90
|
+
|
83
91
|
Args:
|
84
92
|
query: Natural language query
|
85
|
-
|
93
|
+
|
86
94
|
Returns:
|
87
95
|
Optimized query string for Guardian
|
88
96
|
"""
|
89
97
|
# Handle extremely long queries by truncating first
|
90
98
|
if len(query) > 150:
|
91
99
|
simple_query = " ".join(query.split()[:10])
|
92
|
-
logger.info(
|
100
|
+
logger.info(
|
101
|
+
f"Query too long ({len(query)} chars), truncating to: {simple_query}"
|
102
|
+
)
|
93
103
|
query = simple_query
|
94
|
-
|
104
|
+
|
95
105
|
if not self.llm or not self.optimize_queries:
|
96
106
|
# Return original query if no LLM available or optimization disabled
|
97
107
|
return query
|
98
|
-
|
108
|
+
|
99
109
|
try:
|
100
110
|
# Prompt for query optimization
|
101
111
|
prompt = f"""Transform this natural language question into a very short Guardian news search query.
|
@@ -119,36 +129,42 @@ EXAMPLE CONVERSIONS:
|
|
119
129
|
|
120
130
|
Return ONLY the extremely brief search query.
|
121
131
|
"""
|
122
|
-
|
132
|
+
|
123
133
|
# Get response from LLM
|
124
134
|
response = self.llm.invoke(prompt)
|
125
135
|
optimized_query = remove_think_tags(response.content).strip()
|
126
|
-
|
136
|
+
|
127
137
|
# Clean up the query - remove any explanations
|
128
|
-
lines = optimized_query.split(
|
138
|
+
lines = optimized_query.split("\n")
|
129
139
|
for line in lines:
|
130
140
|
line = line.strip()
|
131
|
-
if line and not line.lower().startswith(
|
141
|
+
if line and not line.lower().startswith(
|
142
|
+
("here", "i would", "the best", "this query")
|
143
|
+
):
|
132
144
|
optimized_query = line
|
133
145
|
break
|
134
|
-
|
146
|
+
|
135
147
|
# Remove any quotes that wrap the entire query
|
136
|
-
if
|
148
|
+
if (
|
149
|
+
optimized_query.startswith('"')
|
150
|
+
and optimized_query.endswith('"')
|
151
|
+
and optimized_query.count('"') == 2
|
152
|
+
):
|
137
153
|
optimized_query = optimized_query[1:-1]
|
138
|
-
|
154
|
+
|
139
155
|
logger.info(f"Original query: '{query}'")
|
140
156
|
logger.info(f"Optimized for Guardian: '{optimized_query}'")
|
141
|
-
|
157
|
+
|
142
158
|
return optimized_query
|
143
|
-
|
159
|
+
|
144
160
|
except Exception as e:
|
145
161
|
logger.error(f"Error optimizing query: {e}")
|
146
162
|
return query # Fall back to original query on error
|
147
|
-
|
163
|
+
|
148
164
|
def _adapt_dates_for_query_type(self, query: str) -> None:
|
149
165
|
"""
|
150
166
|
Adapt date range based on query type (historical vs current).
|
151
|
-
|
167
|
+
|
152
168
|
Args:
|
153
169
|
query: The search query
|
154
170
|
"""
|
@@ -160,10 +176,10 @@ Return ONLY the extremely brief search query.
|
|
160
176
|
self.from_date = recent
|
161
177
|
self.order_by = "newest"
|
162
178
|
return
|
163
|
-
|
179
|
+
|
164
180
|
if not self.llm or not self.adaptive_search:
|
165
181
|
return
|
166
|
-
|
182
|
+
|
167
183
|
try:
|
168
184
|
prompt = f"""Is this query asking about HISTORICAL events or CURRENT events?
|
169
185
|
|
@@ -175,103 +191,111 @@ ONE WORD ANSWER ONLY:
|
|
175
191
|
- "UNCLEAR" if can't determine
|
176
192
|
|
177
193
|
ONE WORD ONLY:"""
|
178
|
-
|
194
|
+
|
179
195
|
response = self.llm.invoke(prompt)
|
180
196
|
answer = remove_think_tags(response.content).strip().upper()
|
181
|
-
|
197
|
+
|
182
198
|
# Reset to original parameters first
|
183
199
|
self.from_date = self._original_date_params["from_date"]
|
184
200
|
self.to_date = self._original_date_params["to_date"]
|
185
|
-
|
201
|
+
|
186
202
|
if "HISTORICAL" in answer:
|
187
203
|
# For historical queries, go back 10 years
|
188
|
-
logger.info(
|
189
|
-
|
204
|
+
logger.info(
|
205
|
+
"Query classified as HISTORICAL - extending search timeframe"
|
206
|
+
)
|
207
|
+
ten_years_ago = (datetime.now() - timedelta(days=3650)).strftime(
|
208
|
+
"%Y-%m-%d"
|
209
|
+
)
|
190
210
|
self.from_date = ten_years_ago
|
191
|
-
|
211
|
+
|
192
212
|
elif "CURRENT" in answer:
|
193
213
|
# For current events, focus on recent content
|
194
214
|
logger.info("Query classified as CURRENT - focusing on recent content")
|
195
215
|
recent = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
|
196
216
|
self.from_date = recent
|
197
217
|
self.order_by = "newest" # Prioritize newest for current events
|
198
|
-
|
218
|
+
|
199
219
|
except Exception as e:
|
200
220
|
logger.error(f"Error adapting dates for query type: {e}")
|
201
221
|
# Keep original date parameters on error
|
202
|
-
|
222
|
+
|
203
223
|
def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
|
204
224
|
"""
|
205
225
|
Perform adaptive search that progressively adjusts parameters based on results.
|
206
|
-
|
226
|
+
|
207
227
|
Args:
|
208
228
|
query: The search query
|
209
|
-
|
229
|
+
|
210
230
|
Returns:
|
211
231
|
Tuple of (list of articles, search strategy used)
|
212
232
|
"""
|
213
233
|
# Try with current parameters
|
214
234
|
articles = self._get_all_data(query)
|
215
235
|
strategy = "initial"
|
216
|
-
|
236
|
+
|
217
237
|
# If no results or too few, try different strategies
|
218
238
|
if len(articles) < 3 and self.adaptive_search:
|
219
|
-
logger.info(
|
220
|
-
|
239
|
+
logger.info(
|
240
|
+
f"Initial search found only {len(articles)} results, trying alternative strategies"
|
241
|
+
)
|
242
|
+
|
221
243
|
# Try with expanded date range
|
222
244
|
original_from_date = self.from_date
|
223
245
|
original_order_by = self.order_by
|
224
|
-
|
246
|
+
|
225
247
|
# Strategy 1: Expand to 6 months
|
226
248
|
logger.info("Strategy 1: Expanding time range to 6 months")
|
227
249
|
six_months_ago = (datetime.now() - timedelta(days=180)).strftime("%Y-%m-%d")
|
228
250
|
self.from_date = six_months_ago
|
229
|
-
|
251
|
+
|
230
252
|
articles1 = self._get_all_data(query)
|
231
253
|
if len(articles1) > len(articles):
|
232
254
|
articles = articles1
|
233
255
|
strategy = "expanded_6mo"
|
234
|
-
|
256
|
+
|
235
257
|
# Strategy 2: Expand to all time and try relevance order
|
236
258
|
if len(articles) < 3:
|
237
259
|
logger.info("Strategy 2: Expanding to all time with relevance ordering")
|
238
260
|
self.from_date = "2000-01-01" # Effectively "all time"
|
239
261
|
self.order_by = "relevance"
|
240
|
-
|
262
|
+
|
241
263
|
articles2 = self._get_all_data(query)
|
242
264
|
if len(articles2) > len(articles):
|
243
265
|
articles = articles2
|
244
266
|
strategy = "all_time_relevance"
|
245
|
-
|
267
|
+
|
246
268
|
# Strategy 3: Try removing section constraints
|
247
269
|
if len(articles) < 3 and self.section:
|
248
270
|
logger.info("Strategy 3: Removing section constraint")
|
249
271
|
original_section = self.section
|
250
272
|
self.section = None
|
251
|
-
|
273
|
+
|
252
274
|
articles3 = self._get_all_data(query)
|
253
275
|
if len(articles3) > len(articles):
|
254
276
|
articles = articles3
|
255
277
|
strategy = "no_section"
|
256
|
-
|
278
|
+
|
257
279
|
# Restore section setting
|
258
280
|
self.section = original_section
|
259
|
-
|
281
|
+
|
260
282
|
# Restore original settings
|
261
283
|
self.from_date = original_from_date
|
262
284
|
self.order_by = original_order_by
|
263
|
-
|
264
|
-
logger.info(
|
285
|
+
|
286
|
+
logger.info(
|
287
|
+
f"Adaptive search using strategy '{strategy}' found {len(articles)} results"
|
288
|
+
)
|
265
289
|
return articles, strategy
|
266
|
-
|
290
|
+
|
267
291
|
def _get_all_data(self, query: str) -> List[Dict[str, Any]]:
|
268
292
|
"""
|
269
293
|
Get all article data from The Guardian API in a single call.
|
270
294
|
Always requests all fields for simplicity.
|
271
|
-
|
295
|
+
|
272
296
|
Args:
|
273
297
|
query: The search query
|
274
|
-
|
298
|
+
|
275
299
|
Returns:
|
276
300
|
List of articles with all data
|
277
301
|
"""
|
@@ -280,20 +304,24 @@ ONE WORD ONLY:"""
|
|
280
304
|
if not query or query.strip() == "":
|
281
305
|
query = "news"
|
282
306
|
logger.warning("Empty query provided, using 'news' as default")
|
283
|
-
|
307
|
+
|
284
308
|
# Ensure query is not too long for API
|
285
309
|
if len(query) > 100:
|
286
|
-
logger.warning(
|
310
|
+
logger.warning(
|
311
|
+
f"Query too long for Guardian API ({len(query)} chars), truncating"
|
312
|
+
)
|
287
313
|
query = query[:100]
|
288
|
-
|
314
|
+
|
289
315
|
# Always request all fields for simplicity
|
290
316
|
# Ensure max_results is an integer to avoid comparison errors
|
291
|
-
page_size = min(
|
292
|
-
|
317
|
+
page_size = min(
|
318
|
+
int(self.max_results) if self.max_results is not None else 10, 50
|
319
|
+
)
|
320
|
+
|
293
321
|
# Log full parameters for debugging
|
294
322
|
logger.info(f"Guardian API search query: '{query}'")
|
295
323
|
logger.info(f"Guardian API date range: {self.from_date} to {self.to_date}")
|
296
|
-
|
324
|
+
|
297
325
|
params = {
|
298
326
|
"q": query,
|
299
327
|
"api-key": self.api_key,
|
@@ -302,36 +330,36 @@ ONE WORD ONLY:"""
|
|
302
330
|
"order-by": self.order_by,
|
303
331
|
"page-size": page_size, # API maximum is 50
|
304
332
|
"show-fields": "headline,trailText,byline,body,publication",
|
305
|
-
"show-tags": "keyword"
|
333
|
+
"show-tags": "keyword",
|
306
334
|
}
|
307
|
-
|
335
|
+
|
308
336
|
# Add section filter if specified
|
309
337
|
if self.section:
|
310
338
|
params["section"] = self.section
|
311
|
-
|
339
|
+
|
312
340
|
# Log the complete request parameters (except API key)
|
313
341
|
log_params = params.copy()
|
314
342
|
log_params["api-key"] = "REDACTED"
|
315
343
|
logger.info(f"Guardian API request parameters: {log_params}")
|
316
|
-
|
344
|
+
|
317
345
|
# Execute the API request
|
318
346
|
response = requests.get(self.api_url, params=params)
|
319
347
|
response.raise_for_status()
|
320
|
-
|
348
|
+
|
321
349
|
data = response.json()
|
322
|
-
|
350
|
+
|
323
351
|
# Extract results from the response
|
324
352
|
articles = data.get("response", {}).get("results", [])
|
325
353
|
logger.info(f"Guardian API returned {len(articles)} articles")
|
326
|
-
|
354
|
+
|
327
355
|
# Format results to include all data
|
328
356
|
formatted_articles = []
|
329
357
|
for i, article in enumerate(articles):
|
330
358
|
if i >= self.max_results:
|
331
359
|
break
|
332
|
-
|
360
|
+
|
333
361
|
fields = article.get("fields", {})
|
334
|
-
|
362
|
+
|
335
363
|
# Format the article with all fields
|
336
364
|
result = {
|
337
365
|
"id": article.get("id", ""),
|
@@ -342,42 +370,46 @@ ONE WORD ONLY:"""
|
|
342
370
|
"section": article.get("sectionName", ""),
|
343
371
|
"author": fields.get("byline", ""),
|
344
372
|
"content": fields.get("body", ""),
|
345
|
-
"full_content": fields.get("body", "")
|
373
|
+
"full_content": fields.get("body", ""),
|
346
374
|
}
|
347
|
-
|
375
|
+
|
348
376
|
# Extract tags/keywords
|
349
377
|
tags = article.get("tags", [])
|
350
|
-
result["keywords"] = [
|
351
|
-
|
378
|
+
result["keywords"] = [
|
379
|
+
tag.get("webTitle", "")
|
380
|
+
for tag in tags
|
381
|
+
if tag.get("type") == "keyword"
|
382
|
+
]
|
383
|
+
|
352
384
|
formatted_articles.append(result)
|
353
|
-
|
385
|
+
|
354
386
|
return formatted_articles
|
355
|
-
|
387
|
+
|
356
388
|
except Exception as e:
|
357
389
|
logger.error(f"Error getting data from The Guardian API: {e}")
|
358
390
|
return []
|
359
|
-
|
391
|
+
|
360
392
|
def _get_previews(self, query: str) -> List[Dict[str, Any]]:
|
361
393
|
"""
|
362
394
|
Get preview information for Guardian articles with enhanced optimization.
|
363
|
-
|
395
|
+
|
364
396
|
Args:
|
365
397
|
query: The search query
|
366
|
-
|
398
|
+
|
367
399
|
Returns:
|
368
400
|
List of preview dictionaries
|
369
401
|
"""
|
370
402
|
logger.info(f"Getting articles from The Guardian API for query: {query}")
|
371
|
-
|
403
|
+
|
372
404
|
# Step 1: Optimize the query using LLM
|
373
405
|
optimized_query = self._optimize_query_for_guardian(query)
|
374
|
-
|
406
|
+
|
375
407
|
# Step 2: Adapt date parameters based on query type
|
376
408
|
self._adapt_dates_for_query_type(optimized_query)
|
377
|
-
|
409
|
+
|
378
410
|
# Step 3: Perform adaptive search
|
379
411
|
articles, strategy = self._adaptive_search(optimized_query)
|
380
|
-
|
412
|
+
|
381
413
|
# Store search metadata for debugging
|
382
414
|
self._search_metadata = {
|
383
415
|
"original_query": query,
|
@@ -386,12 +418,12 @@ ONE WORD ONLY:"""
|
|
386
418
|
"from_date": self.from_date,
|
387
419
|
"to_date": self.to_date,
|
388
420
|
"section": self.section,
|
389
|
-
"order_by": self.order_by
|
421
|
+
"order_by": self.order_by,
|
390
422
|
}
|
391
|
-
|
423
|
+
|
392
424
|
# Store full articles for later use
|
393
425
|
self._full_articles = {a["id"]: a for a in articles}
|
394
|
-
|
426
|
+
|
395
427
|
# Return only preview fields for each article
|
396
428
|
previews = []
|
397
429
|
for article in articles:
|
@@ -403,70 +435,79 @@ ONE WORD ONLY:"""
|
|
403
435
|
"publication_date": article["publication_date"],
|
404
436
|
"section": article["section"],
|
405
437
|
"author": article["author"],
|
406
|
-
"keywords": article.get("keywords", [])
|
438
|
+
"keywords": article.get("keywords", []),
|
407
439
|
}
|
408
440
|
previews.append(preview)
|
409
|
-
|
441
|
+
|
410
442
|
return previews
|
411
|
-
|
412
|
-
def _get_full_content(
|
443
|
+
|
444
|
+
def _get_full_content(
|
445
|
+
self, relevant_items: List[Dict[str, Any]]
|
446
|
+
) -> List[Dict[str, Any]]:
|
413
447
|
"""
|
414
448
|
Get full content for the relevant Guardian articles.
|
415
449
|
Restores full content from the cached data.
|
416
|
-
|
450
|
+
|
417
451
|
Args:
|
418
452
|
relevant_items: List of relevant preview dictionaries
|
419
|
-
|
453
|
+
|
420
454
|
Returns:
|
421
455
|
List of result dictionaries with full content
|
422
456
|
"""
|
423
|
-
logger.info(
|
424
|
-
|
457
|
+
logger.info(
|
458
|
+
f"Adding full content to {len(relevant_items)} relevant Guardian articles"
|
459
|
+
)
|
460
|
+
|
425
461
|
# Check if we should add full content
|
426
|
-
if
|
462
|
+
if (
|
463
|
+
hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
|
464
|
+
and search_config.SEARCH_SNIPPETS_ONLY
|
465
|
+
):
|
427
466
|
return relevant_items
|
428
|
-
|
467
|
+
|
429
468
|
# Get full articles for relevant items
|
430
469
|
results = []
|
431
470
|
for item in relevant_items:
|
432
471
|
article_id = item.get("id", "")
|
433
|
-
|
472
|
+
|
434
473
|
# Get the full article from our cache
|
435
|
-
if hasattr(self,
|
474
|
+
if hasattr(self, "_full_articles") and article_id in self._full_articles:
|
436
475
|
results.append(self._full_articles[article_id])
|
437
476
|
else:
|
438
477
|
# If not found (shouldn't happen), just use the preview
|
439
478
|
results.append(item)
|
440
|
-
|
479
|
+
|
441
480
|
return results
|
442
|
-
|
481
|
+
|
443
482
|
def run(self, query: str) -> List[Dict[str, Any]]:
|
444
483
|
"""
|
445
484
|
Execute a search using The Guardian API with the enhanced approach.
|
446
|
-
|
485
|
+
|
447
486
|
Args:
|
448
487
|
query: The search query
|
449
|
-
|
488
|
+
|
450
489
|
Returns:
|
451
490
|
List of search results
|
452
491
|
"""
|
453
|
-
logger.info(
|
454
|
-
|
492
|
+
logger.info("---Execute a search using The Guardian (enhanced)---")
|
493
|
+
|
455
494
|
# Additional safety check for None query
|
456
495
|
if query is None:
|
457
496
|
logger.error("None query passed to Guardian search engine")
|
458
497
|
query = "news"
|
459
|
-
|
498
|
+
|
460
499
|
try:
|
461
500
|
# Get previews with our enhanced method
|
462
501
|
previews = self._get_previews(query)
|
463
|
-
|
502
|
+
|
464
503
|
# If no results, try one more time with a simplified query
|
465
504
|
if not previews:
|
466
505
|
simple_query = " ".join([w for w in query.split() if len(w) > 3][:3])
|
467
|
-
logger.warning(
|
506
|
+
logger.warning(
|
507
|
+
f"No Guardian articles found, trying simplified query: {simple_query}"
|
508
|
+
)
|
468
509
|
previews = self._get_previews(simple_query)
|
469
|
-
|
510
|
+
|
470
511
|
# If still no results, try with a very generic query as last resort
|
471
512
|
if not previews and "trump" in query.lower():
|
472
513
|
logger.warning("Trying last resort query: 'Donald Trump'")
|
@@ -474,109 +515,121 @@ ONE WORD ONLY:"""
|
|
474
515
|
elif not previews:
|
475
516
|
logger.warning("Trying last resort query: 'news'")
|
476
517
|
previews = self._get_previews("news")
|
477
|
-
|
518
|
+
|
478
519
|
# If still no results after all attempts, return empty list
|
479
520
|
if not previews:
|
480
|
-
logger.warning(
|
521
|
+
logger.warning("No Guardian articles found after multiple attempts")
|
481
522
|
return []
|
482
|
-
|
523
|
+
|
483
524
|
# Filter for relevance if we have an LLM
|
484
|
-
if
|
525
|
+
if (
|
526
|
+
self.llm
|
527
|
+
and hasattr(self, "max_filtered_results")
|
528
|
+
and self.max_filtered_results
|
529
|
+
):
|
485
530
|
filtered_items = self._filter_for_relevance(previews, query)
|
486
531
|
if not filtered_items:
|
487
532
|
# Fall back to unfiltered results if everything was filtered out
|
488
|
-
logger.warning(
|
489
|
-
|
533
|
+
logger.warning(
|
534
|
+
"All articles filtered out, using unfiltered results"
|
535
|
+
)
|
536
|
+
filtered_items = previews[: self.max_filtered_results]
|
490
537
|
else:
|
491
538
|
filtered_items = previews
|
492
|
-
|
539
|
+
|
493
540
|
# Get full content for relevant items
|
494
541
|
results = self._get_full_content(filtered_items)
|
495
|
-
|
542
|
+
|
496
543
|
# Add source information to make it clear these are from The Guardian
|
497
544
|
for result in results:
|
498
545
|
if "source" not in result:
|
499
546
|
result["source"] = "The Guardian"
|
500
|
-
|
547
|
+
|
501
548
|
# Clean up the cache after use
|
502
|
-
if hasattr(self,
|
549
|
+
if hasattr(self, "_full_articles"):
|
503
550
|
del self._full_articles
|
504
|
-
|
551
|
+
|
505
552
|
# Restore original date parameters
|
506
553
|
self.from_date = self._original_date_params["from_date"]
|
507
554
|
self.to_date = self._original_date_params["to_date"]
|
508
|
-
|
555
|
+
|
509
556
|
# Log search metadata if available
|
510
|
-
if hasattr(self,
|
557
|
+
if hasattr(self, "_search_metadata"):
|
511
558
|
logger.info(f"Search metadata: {self._search_metadata}")
|
512
559
|
del self._search_metadata
|
513
|
-
|
560
|
+
|
514
561
|
return results
|
515
|
-
|
562
|
+
|
516
563
|
except Exception as e:
|
517
564
|
logger.error(f"Error in Guardian search: {e}")
|
518
|
-
|
565
|
+
|
519
566
|
# Restore original date parameters on error
|
520
567
|
self.from_date = self._original_date_params["from_date"]
|
521
568
|
self.to_date = self._original_date_params["to_date"]
|
522
|
-
|
569
|
+
|
523
570
|
return []
|
524
|
-
|
525
|
-
def search_by_section(
|
571
|
+
|
572
|
+
def search_by_section(
|
573
|
+
self, section: str, max_results: Optional[int] = None
|
574
|
+
) -> List[Dict[str, Any]]:
|
526
575
|
"""
|
527
576
|
Search for articles in a specific section.
|
528
|
-
|
577
|
+
|
529
578
|
Args:
|
530
579
|
section: The Guardian section name (e.g., "politics", "technology")
|
531
580
|
max_results: Maximum number of results (defaults to self.max_results)
|
532
|
-
|
581
|
+
|
533
582
|
Returns:
|
534
583
|
List of articles in the section
|
535
584
|
"""
|
536
585
|
original_section = self.section
|
537
586
|
original_max_results = self.max_results
|
538
|
-
|
587
|
+
|
539
588
|
try:
|
540
589
|
# Set section and max_results for this search
|
541
590
|
self.section = section
|
542
591
|
if max_results:
|
543
592
|
self.max_results = max_results
|
544
|
-
|
593
|
+
|
545
594
|
# Use empty query to get all articles in the section
|
546
595
|
return self.run("")
|
547
|
-
|
596
|
+
|
548
597
|
finally:
|
549
598
|
# Restore original values
|
550
599
|
self.section = original_section
|
551
600
|
self.max_results = original_max_results
|
552
|
-
|
553
|
-
def get_recent_articles(
|
601
|
+
|
602
|
+
def get_recent_articles(
|
603
|
+
self, days: int = 7, max_results: Optional[int] = None
|
604
|
+
) -> List[Dict[str, Any]]:
|
554
605
|
"""
|
555
606
|
Get recent articles from The Guardian.
|
556
|
-
|
607
|
+
|
557
608
|
Args:
|
558
609
|
days: Number of days to look back
|
559
610
|
max_results: Maximum number of results (defaults to self.max_results)
|
560
|
-
|
611
|
+
|
561
612
|
Returns:
|
562
613
|
List of recent articles
|
563
614
|
"""
|
564
615
|
original_from_date = self.from_date
|
565
616
|
original_order_by = self.order_by
|
566
617
|
original_max_results = self.max_results
|
567
|
-
|
618
|
+
|
568
619
|
try:
|
569
620
|
# Set parameters for this search
|
570
|
-
self.from_date = (datetime.now() - timedelta(days=days)).strftime(
|
621
|
+
self.from_date = (datetime.now() - timedelta(days=days)).strftime(
|
622
|
+
"%Y-%m-%d"
|
623
|
+
)
|
571
624
|
self.order_by = "newest"
|
572
625
|
if max_results:
|
573
626
|
self.max_results = max_results
|
574
|
-
|
627
|
+
|
575
628
|
# Use empty query to get all recent articles
|
576
629
|
return self.run("")
|
577
|
-
|
630
|
+
|
578
631
|
finally:
|
579
632
|
# Restore original values
|
580
633
|
self.from_date = original_from_date
|
581
634
|
self.order_by = original_order_by
|
582
|
-
self.max_results = original_max_results
|
635
|
+
self.max_results = original_max_results
|