local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. local_deep_research/__init__.py +23 -22
  2. local_deep_research/__main__.py +16 -0
  3. local_deep_research/advanced_search_system/__init__.py +7 -0
  4. local_deep_research/advanced_search_system/filters/__init__.py +8 -0
  5. local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
  6. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
  7. local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
  8. local_deep_research/advanced_search_system/findings/repository.py +452 -0
  9. local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
  10. local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
  11. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
  12. local_deep_research/advanced_search_system/questions/__init__.py +1 -0
  13. local_deep_research/advanced_search_system/questions/base_question.py +64 -0
  14. local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
  15. local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
  16. local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
  17. local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
  18. local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
  19. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
  20. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
  21. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
  22. local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
  23. local_deep_research/advanced_search_system/tools/__init__.py +1 -0
  24. local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
  25. local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
  26. local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
  27. local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
  28. local_deep_research/api/__init__.py +5 -5
  29. local_deep_research/api/research_functions.py +154 -160
  30. local_deep_research/app.py +8 -0
  31. local_deep_research/citation_handler.py +25 -16
  32. local_deep_research/{config.py → config/config_files.py} +102 -110
  33. local_deep_research/config/llm_config.py +472 -0
  34. local_deep_research/config/search_config.py +77 -0
  35. local_deep_research/defaults/__init__.py +10 -5
  36. local_deep_research/defaults/main.toml +2 -2
  37. local_deep_research/defaults/search_engines.toml +60 -34
  38. local_deep_research/main.py +121 -19
  39. local_deep_research/migrate_db.py +147 -0
  40. local_deep_research/report_generator.py +87 -45
  41. local_deep_research/search_system.py +153 -283
  42. local_deep_research/setup_data_dir.py +35 -0
  43. local_deep_research/test_migration.py +178 -0
  44. local_deep_research/utilities/__init__.py +0 -0
  45. local_deep_research/utilities/db_utils.py +49 -0
  46. local_deep_research/{utilties → utilities}/enums.py +2 -2
  47. local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
  48. local_deep_research/utilities/search_utilities.py +242 -0
  49. local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
  50. local_deep_research/web/__init__.py +0 -1
  51. local_deep_research/web/app.py +86 -1709
  52. local_deep_research/web/app_factory.py +289 -0
  53. local_deep_research/web/database/README.md +70 -0
  54. local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
  55. local_deep_research/web/database/migrations.py +447 -0
  56. local_deep_research/web/database/models.py +117 -0
  57. local_deep_research/web/database/schema_upgrade.py +107 -0
  58. local_deep_research/web/models/database.py +294 -0
  59. local_deep_research/web/models/settings.py +94 -0
  60. local_deep_research/web/routes/api_routes.py +559 -0
  61. local_deep_research/web/routes/history_routes.py +354 -0
  62. local_deep_research/web/routes/research_routes.py +715 -0
  63. local_deep_research/web/routes/settings_routes.py +1583 -0
  64. local_deep_research/web/services/research_service.py +947 -0
  65. local_deep_research/web/services/resource_service.py +149 -0
  66. local_deep_research/web/services/settings_manager.py +669 -0
  67. local_deep_research/web/services/settings_service.py +187 -0
  68. local_deep_research/web/services/socket_service.py +210 -0
  69. local_deep_research/web/static/css/custom_dropdown.css +277 -0
  70. local_deep_research/web/static/css/settings.css +1223 -0
  71. local_deep_research/web/static/css/styles.css +525 -48
  72. local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
  73. local_deep_research/web/static/js/components/detail.js +348 -0
  74. local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
  75. local_deep_research/web/static/js/components/fallback/ui.js +215 -0
  76. local_deep_research/web/static/js/components/history.js +487 -0
  77. local_deep_research/web/static/js/components/logpanel.js +949 -0
  78. local_deep_research/web/static/js/components/progress.js +1107 -0
  79. local_deep_research/web/static/js/components/research.js +1865 -0
  80. local_deep_research/web/static/js/components/results.js +766 -0
  81. local_deep_research/web/static/js/components/settings.js +3981 -0
  82. local_deep_research/web/static/js/components/settings_sync.js +106 -0
  83. local_deep_research/web/static/js/main.js +226 -0
  84. local_deep_research/web/static/js/services/api.js +253 -0
  85. local_deep_research/web/static/js/services/audio.js +31 -0
  86. local_deep_research/web/static/js/services/formatting.js +119 -0
  87. local_deep_research/web/static/js/services/pdf.js +622 -0
  88. local_deep_research/web/static/js/services/socket.js +882 -0
  89. local_deep_research/web/static/js/services/ui.js +546 -0
  90. local_deep_research/web/templates/base.html +72 -0
  91. local_deep_research/web/templates/components/custom_dropdown.html +47 -0
  92. local_deep_research/web/templates/components/log_panel.html +32 -0
  93. local_deep_research/web/templates/components/mobile_nav.html +22 -0
  94. local_deep_research/web/templates/components/settings_form.html +299 -0
  95. local_deep_research/web/templates/components/sidebar.html +21 -0
  96. local_deep_research/web/templates/pages/details.html +73 -0
  97. local_deep_research/web/templates/pages/history.html +51 -0
  98. local_deep_research/web/templates/pages/progress.html +57 -0
  99. local_deep_research/web/templates/pages/research.html +139 -0
  100. local_deep_research/web/templates/pages/results.html +59 -0
  101. local_deep_research/web/templates/settings_dashboard.html +78 -192
  102. local_deep_research/web/utils/__init__.py +0 -0
  103. local_deep_research/web/utils/formatters.py +76 -0
  104. local_deep_research/web_search_engines/engines/full_search.py +18 -16
  105. local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
  106. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
  107. local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
  108. local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
  109. local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
  110. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
  111. local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
  112. local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
  113. local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
  114. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
  115. local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
  116. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
  117. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
  118. local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
  119. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
  120. local_deep_research/web_search_engines/search_engine_base.py +174 -99
  121. local_deep_research/web_search_engines/search_engine_factory.py +192 -102
  122. local_deep_research/web_search_engines/search_engines_config.py +22 -15
  123. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
  124. local_deep_research-0.2.2.dist-info/RECORD +135 -0
  125. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
  126. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
  127. local_deep_research/defaults/llm_config.py +0 -338
  128. local_deep_research/utilties/search_utilities.py +0 -114
  129. local_deep_research/web/static/js/app.js +0 -3763
  130. local_deep_research/web/templates/api_keys_config.html +0 -82
  131. local_deep_research/web/templates/collections_config.html +0 -90
  132. local_deep_research/web/templates/index.html +0 -348
  133. local_deep_research/web/templates/llm_config.html +0 -120
  134. local_deep_research/web/templates/main_config.html +0 -89
  135. local_deep_research/web/templates/search_engines_config.html +0 -154
  136. local_deep_research/web/templates/settings.html +0 -519
  137. local_deep_research-0.1.26.dist-info/RECORD +0 -61
  138. local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
  139. /local_deep_research/{utilties → config}/__init__.py +0 -0
  140. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
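The renames in this listing are the most visible API-level change: the misspelled utilties package becomes utilities, and the old top-level config.py is split into a config/ package (config_files.py, llm_config.py, search_config.py), with defaults/llm_config.py removed. A minimal sketch of how downstream imports shift between the two releases, assuming the new modules resolve from outside the package the same way the relative imports in the diff below resolve inside it:

    # 0.1.26
    # from local_deep_research import config
    # from local_deep_research.utilties.search_utilities import remove_think_tags

    # 0.2.2
    from local_deep_research.config import search_config
    from local_deep_research.utilities.search_utilities import remove_think_tags

    # search_config carries flags the engines consult, e.g. SEARCH_SNIPPETS_ONLY below.
    snippets_only = getattr(search_config, "SEARCH_SNIPPETS_ONLY", False)
    cleaned = remove_think_tags("<think>scratch work</think>Final answer")

The hunks captured below are from a single file in the listing: local_deep_research/web_search_engines/engines/search_engine_guardian.py (entry 111, +210 -157).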
@@ -1,35 +1,39 @@
-import requests
 import logging
-from typing import Dict, List, Any, Optional, Tuple
 import os
 from datetime import datetime, timedelta
+from typing import Any, Dict, List, Optional, Tuple
+
+import requests
 from langchain_core.language_models import BaseLLM

-from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
-from local_deep_research import config
-from local_deep_research.utilties.search_utilities import remove_think_tags
+from ...config import search_config
+from ...utilities.search_utilities import remove_think_tags
+from ..search_engine_base import BaseSearchEngine

 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+
 class GuardianSearchEngine(BaseSearchEngine):
     """Enhanced Guardian API search engine implementation with LLM query optimization"""
-
-    def __init__(self,
-                 max_results: int = 10,
-                 api_key: Optional[str] = None,
-                 from_date: Optional[str] = None,
-                 to_date: Optional[str] = None,
-                 section: Optional[str] = None,
-                 order_by: str = "relevance",
-                 llm: Optional[BaseLLM] = None,
-                 max_filtered_results: Optional[int] = None,
-                 optimize_queries: bool = True,
-                 adaptive_search: bool = True):
+
+    def __init__(
+        self,
+        max_results: int = 10,
+        api_key: Optional[str] = None,
+        from_date: Optional[str] = None,
+        to_date: Optional[str] = None,
+        section: Optional[str] = None,
+        order_by: str = "relevance",
+        llm: Optional[BaseLLM] = None,
+        max_filtered_results: Optional[int] = None,
+        optimize_queries: bool = True,
+        adaptive_search: bool = True,
+    ):
         """
         Initialize The Guardian search engine with enhanced features.
-
+
         Args:
             max_results: Maximum number of search results
             api_key: The Guardian API key (can also be set in GUARDIAN_API_KEY env)
@@ -43,14 +47,18 @@ class GuardianSearchEngine(BaseSearchEngine):
             adaptive_search: Whether to use adaptive search (adjusting date ranges)
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
+        super().__init__(
+            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+        )
         self.api_key = api_key or os.getenv("GUARDIAN_API_KEY")
         self.optimize_queries = optimize_queries
         self.adaptive_search = adaptive_search
-
+
         if not self.api_key:
-            raise ValueError("Guardian API key not found. Please provide api_key or set the GUARDIAN_API_KEY environment variable.")
-
+            raise ValueError(
+                "Guardian API key not found. Please provide api_key or set the GUARDIAN_API_KEY environment variable."
+            )
+
         # Set date ranges if not provided
         if not from_date:
             # Default to one month ago
@@ -58,44 +66,46 @@ class GuardianSearchEngine(BaseSearchEngine):
             self.from_date = one_month_ago.strftime("%Y-%m-%d")
         else:
             self.from_date = from_date
-
+
         if not to_date:
             # Default to today
             self.to_date = datetime.now().strftime("%Y-%m-%d")
         else:
             self.to_date = to_date
-
+
         self.section = section
         self.order_by = order_by
         self._original_date_params = {
             "from_date": self.from_date,
-            "to_date": self.to_date
+            "to_date": self.to_date,
         }
-
+
         # API base URL
         self.api_url = "https://content.guardianapis.com/search"
-
+
     def _optimize_query_for_guardian(self, query: str) -> str:
         """
         Optimize a natural language query for Guardian search.
         Uses LLM to transform questions into effective news search queries.
-
+
         Args:
             query: Natural language query
-
+
         Returns:
             Optimized query string for Guardian
         """
         # Handle extremely long queries by truncating first
         if len(query) > 150:
             simple_query = " ".join(query.split()[:10])
-            logger.info(f"Query too long ({len(query)} chars), truncating to: {simple_query}")
+            logger.info(
+                f"Query too long ({len(query)} chars), truncating to: {simple_query}"
+            )
             query = simple_query
-
+
         if not self.llm or not self.optimize_queries:
             # Return original query if no LLM available or optimization disabled
             return query
-
+
         try:
             # Prompt for query optimization
             prompt = f"""Transform this natural language question into a very short Guardian news search query.
@@ -119,36 +129,42 @@ EXAMPLE CONVERSIONS:

 Return ONLY the extremely brief search query.
 """
-
+
             # Get response from LLM
             response = self.llm.invoke(prompt)
             optimized_query = remove_think_tags(response.content).strip()
-
+
             # Clean up the query - remove any explanations
-            lines = optimized_query.split('\n')
+            lines = optimized_query.split("\n")
             for line in lines:
                 line = line.strip()
-                if line and not line.lower().startswith(('here', 'i would', 'the best', 'this query')):
+                if line and not line.lower().startswith(
+                    ("here", "i would", "the best", "this query")
+                ):
                     optimized_query = line
                     break
-
+
             # Remove any quotes that wrap the entire query
-            if optimized_query.startswith('"') and optimized_query.endswith('"') and optimized_query.count('"') == 2:
+            if (
+                optimized_query.startswith('"')
+                and optimized_query.endswith('"')
+                and optimized_query.count('"') == 2
+            ):
                 optimized_query = optimized_query[1:-1]
-
+
             logger.info(f"Original query: '{query}'")
             logger.info(f"Optimized for Guardian: '{optimized_query}'")
-
+
             return optimized_query
-
+
         except Exception as e:
             logger.error(f"Error optimizing query: {e}")
             return query  # Fall back to original query on error
-
+
     def _adapt_dates_for_query_type(self, query: str) -> None:
         """
         Adapt date range based on query type (historical vs current).
-
+
         Args:
             query: The search query
         """
@@ -160,10 +176,10 @@ Return ONLY the extremely brief search query.
             self.from_date = recent
             self.order_by = "newest"
             return
-
+
         if not self.llm or not self.adaptive_search:
             return
-
+
         try:
             prompt = f"""Is this query asking about HISTORICAL events or CURRENT events?

@@ -175,103 +191,111 @@ ONE WORD ANSWER ONLY:
 - "UNCLEAR" if can't determine

 ONE WORD ONLY:"""
-
+
             response = self.llm.invoke(prompt)
             answer = remove_think_tags(response.content).strip().upper()
-
+
             # Reset to original parameters first
             self.from_date = self._original_date_params["from_date"]
             self.to_date = self._original_date_params["to_date"]
-
+
             if "HISTORICAL" in answer:
                 # For historical queries, go back 10 years
-                logger.info("Query classified as HISTORICAL - extending search timeframe")
-                ten_years_ago = (datetime.now() - timedelta(days=3650)).strftime("%Y-%m-%d")
+                logger.info(
+                    "Query classified as HISTORICAL - extending search timeframe"
+                )
+                ten_years_ago = (datetime.now() - timedelta(days=3650)).strftime(
+                    "%Y-%m-%d"
+                )
                 self.from_date = ten_years_ago
-
+
             elif "CURRENT" in answer:
                 # For current events, focus on recent content
                 logger.info("Query classified as CURRENT - focusing on recent content")
                 recent = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
                 self.from_date = recent
                 self.order_by = "newest"  # Prioritize newest for current events
-
+
         except Exception as e:
             logger.error(f"Error adapting dates for query type: {e}")
             # Keep original date parameters on error
-
+
     def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
         """
         Perform adaptive search that progressively adjusts parameters based on results.
-
+
         Args:
             query: The search query
-
+
         Returns:
             Tuple of (list of articles, search strategy used)
         """
         # Try with current parameters
         articles = self._get_all_data(query)
         strategy = "initial"
-
+
         # If no results or too few, try different strategies
         if len(articles) < 3 and self.adaptive_search:
-            logger.info(f"Initial search found only {len(articles)} results, trying alternative strategies")
-
+            logger.info(
+                f"Initial search found only {len(articles)} results, trying alternative strategies"
+            )
+
             # Try with expanded date range
             original_from_date = self.from_date
             original_order_by = self.order_by
-
+
             # Strategy 1: Expand to 6 months
             logger.info("Strategy 1: Expanding time range to 6 months")
             six_months_ago = (datetime.now() - timedelta(days=180)).strftime("%Y-%m-%d")
             self.from_date = six_months_ago
-
+
             articles1 = self._get_all_data(query)
             if len(articles1) > len(articles):
                 articles = articles1
                 strategy = "expanded_6mo"
-
+
             # Strategy 2: Expand to all time and try relevance order
             if len(articles) < 3:
                 logger.info("Strategy 2: Expanding to all time with relevance ordering")
                 self.from_date = "2000-01-01"  # Effectively "all time"
                 self.order_by = "relevance"
-
+
                 articles2 = self._get_all_data(query)
                 if len(articles2) > len(articles):
                     articles = articles2
                     strategy = "all_time_relevance"
-
+
             # Strategy 3: Try removing section constraints
             if len(articles) < 3 and self.section:
                 logger.info("Strategy 3: Removing section constraint")
                 original_section = self.section
                 self.section = None
-
+
                 articles3 = self._get_all_data(query)
                 if len(articles3) > len(articles):
                     articles = articles3
                     strategy = "no_section"
-
+
                 # Restore section setting
                 self.section = original_section
-
+
             # Restore original settings
             self.from_date = original_from_date
             self.order_by = original_order_by
-
-        logger.info(f"Adaptive search using strategy '{strategy}' found {len(articles)} results")
+
+        logger.info(
+            f"Adaptive search using strategy '{strategy}' found {len(articles)} results"
+        )
         return articles, strategy
-
+
     def _get_all_data(self, query: str) -> List[Dict[str, Any]]:
         """
         Get all article data from The Guardian API in a single call.
         Always requests all fields for simplicity.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of articles with all data
         """
@@ -280,20 +304,24 @@ ONE WORD ONLY:"""
             if not query or query.strip() == "":
                 query = "news"
                 logger.warning("Empty query provided, using 'news' as default")
-
+
             # Ensure query is not too long for API
             if len(query) > 100:
-                logger.warning(f"Query too long for Guardian API ({len(query)} chars), truncating")
+                logger.warning(
+                    f"Query too long for Guardian API ({len(query)} chars), truncating"
+                )
                 query = query[:100]
-
+
             # Always request all fields for simplicity
             # Ensure max_results is an integer to avoid comparison errors
-            page_size = min(int(self.max_results) if self.max_results is not None else 10, 50)
-
+            page_size = min(
+                int(self.max_results) if self.max_results is not None else 10, 50
+            )
+
             # Log full parameters for debugging
             logger.info(f"Guardian API search query: '{query}'")
             logger.info(f"Guardian API date range: {self.from_date} to {self.to_date}")
-
+
             params = {
                 "q": query,
                 "api-key": self.api_key,
@@ -302,36 +330,36 @@ ONE WORD ONLY:"""
                 "order-by": self.order_by,
                 "page-size": page_size,  # API maximum is 50
                 "show-fields": "headline,trailText,byline,body,publication",
-                "show-tags": "keyword"
+                "show-tags": "keyword",
             }
-
+
             # Add section filter if specified
             if self.section:
                 params["section"] = self.section
-
+
             # Log the complete request parameters (except API key)
             log_params = params.copy()
             log_params["api-key"] = "REDACTED"
             logger.info(f"Guardian API request parameters: {log_params}")
-
+
             # Execute the API request
             response = requests.get(self.api_url, params=params)
             response.raise_for_status()
-
+
             data = response.json()
-
+
             # Extract results from the response
             articles = data.get("response", {}).get("results", [])
             logger.info(f"Guardian API returned {len(articles)} articles")
-
+
             # Format results to include all data
             formatted_articles = []
             for i, article in enumerate(articles):
                 if i >= self.max_results:
                     break
-
+
                 fields = article.get("fields", {})
-
+
                 # Format the article with all fields
                 result = {
                     "id": article.get("id", ""),
@@ -342,42 +370,46 @@ ONE WORD ONLY:"""
                     "section": article.get("sectionName", ""),
                     "author": fields.get("byline", ""),
                     "content": fields.get("body", ""),
-                    "full_content": fields.get("body", "")
+                    "full_content": fields.get("body", ""),
                 }
-
+
                 # Extract tags/keywords
                 tags = article.get("tags", [])
-                result["keywords"] = [tag.get("webTitle", "") for tag in tags if tag.get("type") == "keyword"]
-
+                result["keywords"] = [
+                    tag.get("webTitle", "")
+                    for tag in tags
+                    if tag.get("type") == "keyword"
+                ]
+
                 formatted_articles.append(result)
-
+
             return formatted_articles
-
+
         except Exception as e:
             logger.error(f"Error getting data from The Guardian API: {e}")
             return []
-
+
     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """
         Get preview information for Guardian articles with enhanced optimization.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of preview dictionaries
         """
         logger.info(f"Getting articles from The Guardian API for query: {query}")
-
+
         # Step 1: Optimize the query using LLM
         optimized_query = self._optimize_query_for_guardian(query)
-
+
         # Step 2: Adapt date parameters based on query type
         self._adapt_dates_for_query_type(optimized_query)
-
+
         # Step 3: Perform adaptive search
         articles, strategy = self._adaptive_search(optimized_query)
-
+
         # Store search metadata for debugging
         self._search_metadata = {
             "original_query": query,
@@ -386,12 +418,12 @@ ONE WORD ONLY:"""
             "from_date": self.from_date,
             "to_date": self.to_date,
             "section": self.section,
-            "order_by": self.order_by
+            "order_by": self.order_by,
         }
-
+
         # Store full articles for later use
         self._full_articles = {a["id"]: a for a in articles}
-
+
         # Return only preview fields for each article
         previews = []
         for article in articles:
@@ -403,70 +435,79 @@ ONE WORD ONLY:"""
                 "publication_date": article["publication_date"],
                 "section": article["section"],
                 "author": article["author"],
-                "keywords": article.get("keywords", [])
+                "keywords": article.get("keywords", []),
             }
             previews.append(preview)
-
+
         return previews
-
-    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
         """
         Get full content for the relevant Guardian articles.
         Restores full content from the cached data.
-
+
         Args:
             relevant_items: List of relevant preview dictionaries
-
+
         Returns:
             List of result dictionaries with full content
         """
-        logger.info(f"Adding full content to {len(relevant_items)} relevant Guardian articles")
-
+        logger.info(
+            f"Adding full content to {len(relevant_items)} relevant Guardian articles"
+        )
+
         # Check if we should add full content
-        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+        if (
+            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            and search_config.SEARCH_SNIPPETS_ONLY
+        ):
             return relevant_items
-
+
         # Get full articles for relevant items
         results = []
        for item in relevant_items:
             article_id = item.get("id", "")
-
+
             # Get the full article from our cache
-            if hasattr(self, '_full_articles') and article_id in self._full_articles:
+            if hasattr(self, "_full_articles") and article_id in self._full_articles:
                 results.append(self._full_articles[article_id])
             else:
                 # If not found (shouldn't happen), just use the preview
                 results.append(item)
-
+
         return results
-
+
     def run(self, query: str) -> List[Dict[str, Any]]:
         """
         Execute a search using The Guardian API with the enhanced approach.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of search results
         """
-        logger.info(f"---Execute a search using The Guardian (enhanced)---")
-
+        logger.info("---Execute a search using The Guardian (enhanced)---")
+
         # Additional safety check for None query
         if query is None:
             logger.error("None query passed to Guardian search engine")
             query = "news"
-
+
         try:
             # Get previews with our enhanced method
             previews = self._get_previews(query)
-
+
             # If no results, try one more time with a simplified query
             if not previews:
                 simple_query = " ".join([w for w in query.split() if len(w) > 3][:3])
-                logger.warning(f"No Guardian articles found, trying simplified query: {simple_query}")
+                logger.warning(
+                    f"No Guardian articles found, trying simplified query: {simple_query}"
+                )
                 previews = self._get_previews(simple_query)
-
+
             # If still no results, try with a very generic query as last resort
             if not previews and "trump" in query.lower():
                 logger.warning("Trying last resort query: 'Donald Trump'")
@@ -474,109 +515,121 @@ ONE WORD ONLY:"""
             elif not previews:
                 logger.warning("Trying last resort query: 'news'")
                 previews = self._get_previews("news")
-
+
             # If still no results after all attempts, return empty list
             if not previews:
-                logger.warning(f"No Guardian articles found after multiple attempts")
+                logger.warning("No Guardian articles found after multiple attempts")
                 return []
-
+
             # Filter for relevance if we have an LLM
-            if self.llm and hasattr(self, 'max_filtered_results') and self.max_filtered_results:
+            if (
+                self.llm
+                and hasattr(self, "max_filtered_results")
+                and self.max_filtered_results
+            ):
                 filtered_items = self._filter_for_relevance(previews, query)
                 if not filtered_items:
                     # Fall back to unfiltered results if everything was filtered out
-                    logger.warning("All articles filtered out, using unfiltered results")
-                    filtered_items = previews[:self.max_filtered_results]
+                    logger.warning(
+                        "All articles filtered out, using unfiltered results"
+                    )
+                    filtered_items = previews[: self.max_filtered_results]
             else:
                 filtered_items = previews
-
+
             # Get full content for relevant items
             results = self._get_full_content(filtered_items)
-
+
             # Add source information to make it clear these are from The Guardian
             for result in results:
                 if "source" not in result:
                     result["source"] = "The Guardian"
-
+
             # Clean up the cache after use
-            if hasattr(self, '_full_articles'):
+            if hasattr(self, "_full_articles"):
                 del self._full_articles
-
+
             # Restore original date parameters
             self.from_date = self._original_date_params["from_date"]
             self.to_date = self._original_date_params["to_date"]
-
+
             # Log search metadata if available
-            if hasattr(self, '_search_metadata'):
+            if hasattr(self, "_search_metadata"):
                 logger.info(f"Search metadata: {self._search_metadata}")
                 del self._search_metadata
-
+
             return results
-
+
         except Exception as e:
             logger.error(f"Error in Guardian search: {e}")
-
+
             # Restore original date parameters on error
             self.from_date = self._original_date_params["from_date"]
             self.to_date = self._original_date_params["to_date"]
-
+
             return []
-
-    def search_by_section(self, section: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
+
+    def search_by_section(
+        self, section: str, max_results: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
         """
         Search for articles in a specific section.
-
+
         Args:
             section: The Guardian section name (e.g., "politics", "technology")
             max_results: Maximum number of results (defaults to self.max_results)
-
+
         Returns:
             List of articles in the section
         """
         original_section = self.section
         original_max_results = self.max_results
-
+
         try:
             # Set section and max_results for this search
             self.section = section
             if max_results:
                 self.max_results = max_results
-
+
             # Use empty query to get all articles in the section
             return self.run("")
-
+
         finally:
             # Restore original values
             self.section = original_section
             self.max_results = original_max_results
-
-    def get_recent_articles(self, days: int = 7, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
+
+    def get_recent_articles(
+        self, days: int = 7, max_results: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
         """
         Get recent articles from The Guardian.
-
+
         Args:
             days: Number of days to look back
            max_results: Maximum number of results (defaults to self.max_results)
-
+
         Returns:
             List of recent articles
         """
         original_from_date = self.from_date
         original_order_by = self.order_by
         original_max_results = self.max_results
-
+
         try:
             # Set parameters for this search
-            self.from_date = (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")
+            self.from_date = (datetime.now() - timedelta(days=days)).strftime(
+                "%Y-%m-%d"
+            )
             self.order_by = "newest"
             if max_results:
                 self.max_results = max_results
-
+
             # Use empty query to get all recent articles
             return self.run("")
-
+
         finally:
             # Restore original values
             self.from_date = original_from_date
             self.order_by = original_order_by
-            self.max_results = original_max_results
+            self.max_results = original_max_results
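Most of the hunks above are mechanical: line wrapping, double-quoted strings, trailing commas, and the swap from the old config module to config.search_config. The constructor, run(), search_by_section(), and get_recent_articles() keep their parameters unchanged apart from the reformatting. A minimal usage sketch against the 0.2.2 layout, assuming the engine is imported directly from the path shown in the file listing and that GUARDIAN_API_KEY is set in the environment:

    from local_deep_research.web_search_engines.engines.search_engine_guardian import (
        GuardianSearchEngine,
    )

    # Reads GUARDIAN_API_KEY from the environment; raises ValueError if it is missing.
    engine = GuardianSearchEngine(
        max_results=5,
        section="technology",
        order_by="relevance",
        llm=None,              # without an LLM, query optimization and relevance filtering are skipped
        adaptive_search=True,  # widens the date range when fewer than 3 articles come back
    )

    results = engine.run("renewable energy policy")
    for article in results:
        print(article.get("id"), article.get("section"), article.get("author"))

    # Convenience wrappers that temporarily override parameters and reuse run("").
    recent = engine.get_recent_articles(days=7, max_results=5)
    politics = engine.search_by_section("politics")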