local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140) hide show
  1. local_deep_research/__init__.py +23 -22
  2. local_deep_research/__main__.py +16 -0
  3. local_deep_research/advanced_search_system/__init__.py +7 -0
  4. local_deep_research/advanced_search_system/filters/__init__.py +8 -0
  5. local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
  6. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
  7. local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
  8. local_deep_research/advanced_search_system/findings/repository.py +452 -0
  9. local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
  10. local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
  11. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
  12. local_deep_research/advanced_search_system/questions/__init__.py +1 -0
  13. local_deep_research/advanced_search_system/questions/base_question.py +64 -0
  14. local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
  15. local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
  16. local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
  17. local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
  18. local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
  19. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
  20. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
  21. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
  22. local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
  23. local_deep_research/advanced_search_system/tools/__init__.py +1 -0
  24. local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
  25. local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
  26. local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
  27. local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
  28. local_deep_research/api/__init__.py +5 -5
  29. local_deep_research/api/research_functions.py +154 -160
  30. local_deep_research/app.py +8 -0
  31. local_deep_research/citation_handler.py +25 -16
  32. local_deep_research/{config.py → config/config_files.py} +102 -110
  33. local_deep_research/config/llm_config.py +472 -0
  34. local_deep_research/config/search_config.py +77 -0
  35. local_deep_research/defaults/__init__.py +10 -5
  36. local_deep_research/defaults/main.toml +2 -2
  37. local_deep_research/defaults/search_engines.toml +60 -34
  38. local_deep_research/main.py +121 -19
  39. local_deep_research/migrate_db.py +147 -0
  40. local_deep_research/report_generator.py +87 -45
  41. local_deep_research/search_system.py +153 -283
  42. local_deep_research/setup_data_dir.py +35 -0
  43. local_deep_research/test_migration.py +178 -0
  44. local_deep_research/utilities/__init__.py +0 -0
  45. local_deep_research/utilities/db_utils.py +49 -0
  46. local_deep_research/{utilties → utilities}/enums.py +2 -2
  47. local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
  48. local_deep_research/utilities/search_utilities.py +242 -0
  49. local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
  50. local_deep_research/web/__init__.py +0 -1
  51. local_deep_research/web/app.py +86 -1709
  52. local_deep_research/web/app_factory.py +289 -0
  53. local_deep_research/web/database/README.md +70 -0
  54. local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
  55. local_deep_research/web/database/migrations.py +447 -0
  56. local_deep_research/web/database/models.py +117 -0
  57. local_deep_research/web/database/schema_upgrade.py +107 -0
  58. local_deep_research/web/models/database.py +294 -0
  59. local_deep_research/web/models/settings.py +94 -0
  60. local_deep_research/web/routes/api_routes.py +559 -0
  61. local_deep_research/web/routes/history_routes.py +354 -0
  62. local_deep_research/web/routes/research_routes.py +715 -0
  63. local_deep_research/web/routes/settings_routes.py +1583 -0
  64. local_deep_research/web/services/research_service.py +947 -0
  65. local_deep_research/web/services/resource_service.py +149 -0
  66. local_deep_research/web/services/settings_manager.py +669 -0
  67. local_deep_research/web/services/settings_service.py +187 -0
  68. local_deep_research/web/services/socket_service.py +210 -0
  69. local_deep_research/web/static/css/custom_dropdown.css +277 -0
  70. local_deep_research/web/static/css/settings.css +1223 -0
  71. local_deep_research/web/static/css/styles.css +525 -48
  72. local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
  73. local_deep_research/web/static/js/components/detail.js +348 -0
  74. local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
  75. local_deep_research/web/static/js/components/fallback/ui.js +215 -0
  76. local_deep_research/web/static/js/components/history.js +487 -0
  77. local_deep_research/web/static/js/components/logpanel.js +949 -0
  78. local_deep_research/web/static/js/components/progress.js +1107 -0
  79. local_deep_research/web/static/js/components/research.js +1865 -0
  80. local_deep_research/web/static/js/components/results.js +766 -0
  81. local_deep_research/web/static/js/components/settings.js +3981 -0
  82. local_deep_research/web/static/js/components/settings_sync.js +106 -0
  83. local_deep_research/web/static/js/main.js +226 -0
  84. local_deep_research/web/static/js/services/api.js +253 -0
  85. local_deep_research/web/static/js/services/audio.js +31 -0
  86. local_deep_research/web/static/js/services/formatting.js +119 -0
  87. local_deep_research/web/static/js/services/pdf.js +622 -0
  88. local_deep_research/web/static/js/services/socket.js +882 -0
  89. local_deep_research/web/static/js/services/ui.js +546 -0
  90. local_deep_research/web/templates/base.html +72 -0
  91. local_deep_research/web/templates/components/custom_dropdown.html +47 -0
  92. local_deep_research/web/templates/components/log_panel.html +32 -0
  93. local_deep_research/web/templates/components/mobile_nav.html +22 -0
  94. local_deep_research/web/templates/components/settings_form.html +299 -0
  95. local_deep_research/web/templates/components/sidebar.html +21 -0
  96. local_deep_research/web/templates/pages/details.html +73 -0
  97. local_deep_research/web/templates/pages/history.html +51 -0
  98. local_deep_research/web/templates/pages/progress.html +57 -0
  99. local_deep_research/web/templates/pages/research.html +139 -0
  100. local_deep_research/web/templates/pages/results.html +59 -0
  101. local_deep_research/web/templates/settings_dashboard.html +78 -192
  102. local_deep_research/web/utils/__init__.py +0 -0
  103. local_deep_research/web/utils/formatters.py +76 -0
  104. local_deep_research/web_search_engines/engines/full_search.py +18 -16
  105. local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
  106. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
  107. local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
  108. local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
  109. local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
  110. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
  111. local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
  112. local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
  113. local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
  114. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
  115. local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
  116. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
  117. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
  118. local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
  119. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
  120. local_deep_research/web_search_engines/search_engine_base.py +174 -99
  121. local_deep_research/web_search_engines/search_engine_factory.py +192 -102
  122. local_deep_research/web_search_engines/search_engines_config.py +22 -15
  123. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
  124. local_deep_research-0.2.2.dist-info/RECORD +135 -0
  125. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
  126. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
  127. local_deep_research/defaults/llm_config.py +0 -338
  128. local_deep_research/utilties/search_utilities.py +0 -114
  129. local_deep_research/web/static/js/app.js +0 -3763
  130. local_deep_research/web/templates/api_keys_config.html +0 -82
  131. local_deep_research/web/templates/collections_config.html +0 -90
  132. local_deep_research/web/templates/index.html +0 -348
  133. local_deep_research/web/templates/llm_config.html +0 -120
  134. local_deep_research/web/templates/main_config.html +0 -89
  135. local_deep_research/web/templates/search_engines_config.html +0 -154
  136. local_deep_research/web/templates/settings.html +0 -519
  137. local_deep_research-0.1.26.dist-info/RECORD +0 -61
  138. local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
  139. /local_deep_research/{utilties → config}/__init__.py +0 -0
  140. {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
@@ -1,48 +1,50 @@
1
- import requests
2
1
  import logging
3
- import json
4
- from typing import Dict, List, Any, Optional, Tuple, Union
5
- from langchain_core.language_models import BaseLLM
6
- import time
7
2
  import re
8
- from datetime import datetime
3
+ import time
4
+ from typing import Any, Dict, List, Optional, Tuple
5
+
6
+ import requests
7
+ from langchain_core.language_models import BaseLLM
9
8
  from requests.adapters import HTTPAdapter
10
9
  from urllib3.util import Retry
11
10
 
12
- from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
13
- from local_deep_research import config
11
+ from ...config import search_config
12
+ from ..search_engine_base import BaseSearchEngine
14
13
 
15
14
  # Setup logging
16
15
  logging.basicConfig(level=logging.INFO)
17
16
  logger = logging.getLogger(__name__)
18
17
 
18
+
19
19
  class SemanticScholarSearchEngine(BaseSearchEngine):
20
20
  """
21
21
  Semantic Scholar search engine implementation with two-phase approach.
22
22
  Provides efficient access to scientific literature across all fields.
23
23
  """
24
-
25
- def __init__(self,
26
- max_results: int = 10,
27
- api_key: Optional[str] = None,
28
- year_range: Optional[Tuple[int, int]] = None,
29
- get_abstracts: bool = True,
30
- get_references: bool = False,
31
- get_citations: bool = False,
32
- get_embeddings: bool = False,
33
- get_tldr: bool = True,
34
- citation_limit: int = 10,
35
- reference_limit: int = 10,
36
- llm: Optional[BaseLLM] = None,
37
- max_filtered_results: Optional[int] = None,
38
- optimize_queries: bool = True,
39
- max_retries: int = 5,
40
- retry_backoff_factor: float = 1.0,
41
- fields_of_study: Optional[List[str]] = None,
42
- publication_types: Optional[List[str]] = None):
24
+
25
+ def __init__(
26
+ self,
27
+ max_results: int = 10,
28
+ api_key: Optional[str] = None,
29
+ year_range: Optional[Tuple[int, int]] = None,
30
+ get_abstracts: bool = True,
31
+ get_references: bool = False,
32
+ get_citations: bool = False,
33
+ get_embeddings: bool = False,
34
+ get_tldr: bool = True,
35
+ citation_limit: int = 10,
36
+ reference_limit: int = 10,
37
+ llm: Optional[BaseLLM] = None,
38
+ max_filtered_results: Optional[int] = None,
39
+ optimize_queries: bool = True,
40
+ max_retries: int = 5,
41
+ retry_backoff_factor: float = 1.0,
42
+ fields_of_study: Optional[List[str]] = None,
43
+ publication_types: Optional[List[str]] = None,
44
+ ):
43
45
  """
44
46
  Initialize the Semantic Scholar search engine.
45
-
47
+
46
48
  Args:
47
49
  max_results: Maximum number of search results
48
50
  api_key: Semantic Scholar API key for higher rate limits (optional)
@@ -63,8 +65,10 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
63
65
  publication_types: List of publication types to filter results
64
66
  """
65
67
  # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
66
- super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
67
-
68
+ super().__init__(
69
+ llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
70
+ )
71
+
68
72
  self.api_key = api_key
69
73
  self.year_range = year_range
70
74
  self.get_abstracts = get_abstracts
@@ -79,71 +83,76 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
79
83
  self.retry_backoff_factor = retry_backoff_factor
80
84
  self.fields_of_study = fields_of_study
81
85
  self.publication_types = publication_types
82
-
86
+
83
87
  # Base API URLs
84
88
  self.base_url = "https://api.semanticscholar.org/graph/v1"
85
89
  self.paper_search_url = f"{self.base_url}/paper/search"
86
90
  self.paper_details_url = f"{self.base_url}/paper"
87
-
91
+
88
92
  # Create a session with retry capabilities
89
93
  self.session = self._create_session()
90
-
94
+
91
95
  # Rate limiting
92
96
  self.rate_limit_wait = 1.0 # Default 1 second between requests
93
97
  self.last_request_time = 0
94
-
98
+
95
99
  def _create_session(self) -> requests.Session:
96
100
  """Create and configure a requests session with retry capabilities"""
97
101
  session = requests.Session()
98
-
102
+
99
103
  # Configure automatic retries with exponential backoff
100
104
  retry_strategy = Retry(
101
105
  total=self.max_retries,
102
106
  backoff_factor=self.retry_backoff_factor,
103
107
  status_forcelist=[429, 500, 502, 503, 504],
104
- allowed_methods={"HEAD", "GET", "POST", "OPTIONS"}
108
+ allowed_methods={"HEAD", "GET", "POST", "OPTIONS"},
105
109
  )
106
-
110
+
107
111
  adapter = HTTPAdapter(max_retries=retry_strategy)
108
112
  session.mount("https://", adapter)
109
-
113
+
110
114
  # Set up headers
111
115
  headers = {"Accept": "application/json"}
112
116
  if self.api_key:
113
117
  headers["x-api-key"] = self.api_key
114
-
118
+
115
119
  session.headers.update(headers)
116
-
120
+
117
121
  return session
118
-
122
+
119
123
  def _respect_rate_limit(self):
120
124
  """Apply rate limiting between requests"""
121
125
  current_time = time.time()
122
126
  elapsed = current_time - self.last_request_time
123
-
127
+
124
128
  if elapsed < self.rate_limit_wait:
125
129
  wait_time = self.rate_limit_wait - elapsed
126
- logger.debug(f"Rate limiting: waiting {wait_time:.2f}s")
130
+ logger.debug("Rate limiting: waiting %.2f s", wait_time)
127
131
  time.sleep(wait_time)
128
-
132
+
129
133
  self.last_request_time = time.time()
130
-
131
- def _make_request(self, url: str, params: Optional[Dict] = None, data: Optional[Dict] = None,
132
- method: str = "GET") -> Dict:
134
+
135
+ def _make_request(
136
+ self,
137
+ url: str,
138
+ params: Optional[Dict] = None,
139
+ data: Optional[Dict] = None,
140
+ method: str = "GET",
141
+ ) -> Dict:
133
142
  """
134
143
  Make a request to the Semantic Scholar API.
135
-
144
+
136
145
  Args:
137
146
  url: API endpoint URL
138
147
  params: Query parameters
139
148
  data: JSON data for POST requests
140
149
  method: HTTP method (GET or POST)
141
-
150
+
142
151
  Returns:
143
152
  API response as dictionary
144
153
  """
145
154
  self._respect_rate_limit()
146
-
155
+
147
156
  try:
148
157
  if method.upper() == "GET":
149
158
  response = self.session.get(url, params=params, timeout=30)
@@ -151,34 +160,34 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
151
160
  response = self.session.post(url, params=params, json=data, timeout=30)
152
161
  else:
153
162
  raise ValueError(f"Unsupported HTTP method: {method}")
154
-
163
+
155
164
  # Handle rate limiting manually if retry strategy fails
156
165
  if response.status_code == 429:
157
166
  logger.warning("Rate limit exceeded, waiting and retrying...")
158
167
  time.sleep(2.0) # Wait longer on rate limit
159
168
  self.rate_limit_wait *= 1.5 # Increase wait time for future requests
160
169
  return self._make_request(url, params, data, method) # Retry
161
-
170
+
162
171
  response.raise_for_status()
163
172
  return response.json()
164
173
  except requests.RequestException as e:
165
174
  logger.error(f"API request failed: {e}")
166
175
  return {}
167
-
176
+
168
177
  def _optimize_query(self, query: str) -> str:
169
178
  """
170
179
  Optimize a natural language query for Semantic Scholar search.
171
180
  If LLM is available, uses it to extract key terms and concepts.
172
-
181
+
173
182
  Args:
174
183
  query: Natural language query
175
-
184
+
176
185
  Returns:
177
186
  Optimized query string
178
187
  """
179
188
  if not self.llm or not self.optimize_queries:
180
189
  return query
181
-
190
+
182
191
  try:
183
192
  prompt = f"""Transform this natural language question into an optimized academic search query.
184
193
 
@@ -198,113 +207,122 @@ EXAMPLE TRANSFORMATIONS:
198
207
 
199
208
  Return ONLY the optimized search query with no explanation.
200
209
  """
201
-
210
+
202
211
  response = self.llm.invoke(prompt)
203
212
  optimized_query = response.content.strip()
204
-
213
+
205
214
  # Clean up the query - remove any explanations
206
- lines = optimized_query.split('\n')
215
+ lines = optimized_query.split("\n")
207
216
  optimized_query = lines[0].strip()
208
-
217
+
209
218
  # Safety check - if query looks too much like an explanation, use original
210
219
  if len(optimized_query.split()) > 15 or ":" in optimized_query:
211
- logger.warning("Query optimization result looks too verbose, using original")
220
+ logger.warning(
221
+ "Query optimization result looks too verbose, using original"
222
+ )
212
223
  return query
213
-
224
+
214
225
  logger.info(f"Original query: '{query}'")
215
226
  logger.info(f"Optimized for search: '{optimized_query}'")
216
-
227
+
217
228
  return optimized_query
218
229
  except Exception as e:
219
230
  logger.error(f"Error optimizing query: {e}")
220
231
  return query # Fall back to original query on error
221
-
232
+
222
233
  def _direct_search(self, query: str) -> List[Dict[str, Any]]:
223
234
  """
224
235
  Make a direct search request to the Semantic Scholar API.
225
-
236
+
226
237
  Args:
227
238
  query: The search query
228
-
239
+
229
240
  Returns:
230
241
  List of paper dictionaries
231
242
  """
232
243
  try:
233
244
  # Configure fields to retrieve
234
245
  fields = [
235
- "paperId",
236
- "externalIds",
237
- "url",
238
- "title",
239
- "abstract",
240
- "venue",
241
- "year",
242
- "authors"
246
+ "paperId",
247
+ "externalIds",
248
+ "url",
249
+ "title",
250
+ "abstract",
251
+ "venue",
252
+ "year",
253
+ "authors",
243
254
  ]
244
-
255
+
245
256
  if self.get_tldr:
246
257
  fields.append("tldr")
247
-
258
+
248
259
  params = {
249
260
  "query": query,
250
261
  "limit": min(self.max_results, 100), # API limit is 100 per request
251
- "fields": ",".join(fields)
262
+ "fields": ",".join(fields),
252
263
  }
253
-
264
+
254
265
  # Add year filter if specified
255
266
  if self.year_range:
256
267
  start_year, end_year = self.year_range
257
268
  params["year"] = f"{start_year}-{end_year}"
258
-
269
+
259
270
  # Add fields of study filter if specified
260
271
  if self.fields_of_study:
261
272
  params["fieldsOfStudy"] = ",".join(self.fields_of_study)
262
-
273
+
263
274
  # Add publication types filter if specified
264
275
  if self.publication_types:
265
276
  params["publicationTypes"] = ",".join(self.publication_types)
266
-
277
+
267
278
  response = self._make_request(self.paper_search_url, params)
268
-
279
+
269
280
  if "data" in response:
270
281
  papers = response["data"]
271
- logger.info(f"Found {len(papers)} papers with direct search for query: '{query}'")
282
+ logger.info(
283
+ f"Found {len(papers)} papers with direct search for query: '{query}'"
284
+ )
272
285
  return papers
273
286
  else:
274
- logger.warning(f"No data in response for direct search query: '{query}'")
287
+ logger.warning(
288
+ f"No data in response for direct search query: '{query}'"
289
+ )
275
290
  return []
276
-
291
+
277
292
  except Exception as e:
278
293
  logger.error(f"Error in direct search: {e}")
279
294
  return []
280
-
295
+
281
296
  def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
282
297
  """
283
298
  Perform an adaptive search that adjusts based on result volume.
284
299
  Uses LLM to generate better fallback queries when available.
285
-
300
+
286
301
  Args:
287
302
  query: The search query
288
-
303
+
289
304
  Returns:
290
305
  Tuple of (list of paper results, search strategy used)
291
306
  """
292
307
  # Start with a standard search
293
308
  papers = self._direct_search(query)
294
309
  strategy = "standard"
295
-
310
+
296
311
  # If no results, try different variations
297
312
  if not papers:
298
313
  # Try removing quotes to broaden search
299
314
  if '"' in query:
300
- unquoted_query = query.replace('"', '')
301
- logger.info(f"No results with quoted terms, trying without quotes: {unquoted_query}")
315
+ unquoted_query = query.replace('"', "")
316
+ logger.info(
317
+ "No results with quoted terms, trying without quotes: %s",
318
+ unquoted_query,
319
+ )
302
320
  papers = self._direct_search(unquoted_query)
303
-
321
+
304
322
  if papers:
305
323
  strategy = "unquoted"
306
324
  return papers, strategy
307
-
325
+
308
326
  # If LLM is available, use it to generate better fallback queries
309
327
  if self.llm:
310
328
  try:
@@ -325,99 +343,109 @@ Format each query on a new line with no numbering or explanation. Keep each quer
325
343
  """
326
344
  # Get the LLM's response
327
345
  response = self.llm.invoke(prompt)
328
-
346
+
329
347
  # Extract the alternative queries
330
348
  alt_queries = []
331
- if hasattr(response, 'content'): # Handle various LLM response formats
349
+ if hasattr(
350
+ response, "content"
351
+ ): # Handle various LLM response formats
332
352
  content = response.content
333
- alt_queries = [q.strip() for q in content.strip().split('\n') if q.strip()]
353
+ alt_queries = [
354
+ q.strip() for q in content.strip().split("\n") if q.strip()
355
+ ]
334
356
  elif isinstance(response, str):
335
- alt_queries = [q.strip() for q in response.strip().split('\n') if q.strip()]
336
-
357
+ alt_queries = [
358
+ q.strip() for q in response.strip().split("\n") if q.strip()
359
+ ]
360
+
337
361
  # Try each alternative query
338
362
  for alt_query in alt_queries[:3]: # Limit to first 3 alternatives
339
- logger.info(f"Trying LLM-suggested query: {alt_query}")
363
+ logger.info("Trying LLM-suggested query: %s", alt_query)
340
364
  alt_papers = self._direct_search(alt_query)
341
-
365
+
342
366
  if alt_papers:
343
- logger.info(f"Found {len(alt_papers)} papers using LLM-suggested query: {alt_query}")
367
+ logger.info(
368
+ "Found %s papers using LLM-suggested query: %s",
369
+ len(alt_papers),
370
+ alt_query,
371
+ )
344
372
  strategy = "llm_alternative"
345
373
  return alt_papers, strategy
346
374
  except Exception as e:
347
- logger.error(f"Error using LLM for query refinement: {e}")
375
+ logger.error("Error using LLM for query refinement: %s", e)
348
376
  # Fall through to simpler strategies
349
-
377
+
350
378
  # Fallback: Try with the longest words (likely specific terms)
351
- words = re.findall(r'\w+', query)
379
+ words = re.findall(r"\w+", query)
352
380
  longer_words = [word for word in words if len(word) > 6]
353
381
  if longer_words:
354
382
  # Use up to 3 of the longest words
355
383
  longer_words = sorted(longer_words, key=len, reverse=True)[:3]
356
- key_terms_query = ' '.join(longer_words)
357
- logger.info(f"Trying with key terms: {key_terms_query}")
384
+ key_terms_query = " ".join(longer_words)
385
+ logger.info("Trying with key terms: %s", key_terms_query)
358
386
  papers = self._direct_search(key_terms_query)
359
-
387
+
360
388
  if papers:
361
389
  strategy = "key_terms"
362
390
  return papers, strategy
363
-
391
+
364
392
  # Final fallback: Try with just the longest word
365
393
  if words:
366
394
  longest_word = max(words, key=len)
367
395
  if len(longest_word) > 5: # Only use if it's reasonably long
368
- logger.info(f"Trying with single key term: {longest_word}")
396
+ logger.info("Trying with single key term: %s", longest_word)
369
397
  papers = self._direct_search(longest_word)
370
-
398
+
371
399
  if papers:
372
400
  strategy = "single_term"
373
401
  return papers, strategy
374
-
402
+
375
403
  return papers, strategy
376
-
404
+
377
405
  def _get_paper_details(self, paper_id: str) -> Dict[str, Any]:
378
406
  """
379
407
  Get detailed information about a specific paper.
380
-
408
+
381
409
  Args:
382
410
  paper_id: Semantic Scholar Paper ID
383
-
411
+
384
412
  Returns:
385
413
  Dictionary with paper details
386
414
  """
387
415
  try:
388
416
  # Construct fields parameter
389
417
  fields = [
390
- "paperId",
391
- "externalIds",
392
- "corpusId",
393
- "url",
394
- "title",
395
- "abstract",
396
- "venue",
397
- "year",
398
- "authors",
399
- "fieldsOfStudy"
418
+ "paperId",
419
+ "externalIds",
420
+ "corpusId",
421
+ "url",
422
+ "title",
423
+ "abstract",
424
+ "venue",
425
+ "year",
426
+ "authors",
427
+ "fieldsOfStudy",
400
428
  ]
401
-
429
+
402
430
  if self.get_tldr:
403
431
  fields.append("tldr")
404
-
432
+
405
433
  if self.get_embeddings:
406
434
  fields.append("embedding")
407
-
435
+
408
436
  # Add citation and reference fields if requested
409
437
  if self.get_citations:
410
438
  fields.append(f"citations.limit({self.citation_limit})")
411
-
439
+
412
440
  if self.get_references:
413
441
  fields.append(f"references.limit({self.reference_limit})")
414
-
442
+
415
443
  # Make the request
416
444
  url = f"{self.paper_details_url}/{paper_id}"
417
445
  params = {"fields": ",".join(fields)}
418
-
446
+
419
447
  return self._make_request(url, params)
420
-
448
+
421
449
  except Exception as e:
422
450
  logger.error(f"Error getting paper details for {paper_id}: {e}")
423
451
  return {}
@@ -425,25 +453,25 @@ Format each query on a new line with no numbering or explanation. Keep each quer
425
453
  def _get_previews(self, query: str) -> List[Dict[str, Any]]:
426
454
  """
427
455
  Get preview information for Semantic Scholar papers.
428
-
456
+
429
457
  Args:
430
458
  query: The search query
431
-
459
+
432
460
  Returns:
433
461
  List of preview dictionaries
434
462
  """
435
463
  logger.info(f"Getting Semantic Scholar previews for query: {query}")
436
-
464
+
437
465
  # Optimize the query if LLM is available
438
466
  optimized_query = self._optimize_query(query)
439
-
467
+
440
468
  # Use the adaptive search approach
441
469
  papers, strategy = self._adaptive_search(optimized_query)
442
-
470
+
443
471
  if not papers:
444
- logger.warning(f"No Semantic Scholar results found")
472
+ logger.warning("No Semantic Scholar results found")
445
473
  return []
446
-
474
+
447
475
  # Format as previews
448
476
  previews = []
449
477
  for paper in papers:
@@ -451,28 +479,34 @@ Format each query on a new line with no numbering or explanation. Keep each quer
451
479
  # Format authors - ensure we have a valid list with string values
452
480
  authors = []
453
481
  if "authors" in paper and paper["authors"]:
454
- authors = [author.get("name", "") for author in paper["authors"] if author and author.get("name")]
455
-
482
+ authors = [
483
+ author.get("name", "")
484
+ for author in paper["authors"]
485
+ if author and author.get("name")
486
+ ]
487
+
456
488
  # Ensure we have valid strings for all fields
457
489
  paper_id = paper.get("paperId", "")
458
490
  title = paper.get("title", "")
459
491
  url = paper.get("url", "")
460
-
492
+
461
493
  # Handle abstract safely, ensuring we always have a string
462
494
  abstract = paper.get("abstract")
463
495
  snippet = ""
464
496
  if abstract:
465
- snippet = abstract[:250] + "..." if len(abstract) > 250 else abstract
466
-
497
+ snippet = (
498
+ abstract[:250] + "..." if len(abstract) > 250 else abstract
499
+ )
500
+
467
501
  venue = paper.get("venue", "")
468
502
  year = paper.get("year")
469
503
  external_ids = paper.get("externalIds", {})
470
-
504
+
471
505
  # Handle TLDR safely
472
506
  tldr_text = ""
473
507
  if paper.get("tldr") and isinstance(paper.get("tldr"), dict):
474
508
  tldr_text = paper.get("tldr", {}).get("text", "")
475
-
509
+
476
510
  # Create preview with basic information, ensuring no None values
477
511
  preview = {
478
512
  "id": paper_id if paper_id else "",
@@ -486,76 +520,85 @@ Format each query on a new line with no numbering or explanation. Keep each quer
486
520
  "source": "Semantic Scholar",
487
521
  "_paper_id": paper_id if paper_id else "",
488
522
  "_search_strategy": strategy,
489
- "tldr": tldr_text
523
+ "tldr": tldr_text,
490
524
  }
491
-
525
+
492
526
  # Store the full paper object for later reference
493
527
  preview["_full_paper"] = paper
494
-
528
+
495
529
  previews.append(preview)
496
530
  except Exception as e:
497
531
  logger.error(f"Error processing paper preview: {e}")
498
532
  # Continue with the next paper
499
-
533
+
500
534
  # Sort by year (newer first) if available
501
535
  previews = sorted(
502
536
  previews,
503
537
  key=lambda p: p.get("year", 0) if p.get("year") is not None else 0,
504
- reverse=True
538
+ reverse=True,
539
+ )
540
+
541
+ logger.info(
542
+ f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}"
505
543
  )
506
-
507
- logger.info(f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}")
508
544
  return previews
509
-
510
- def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
545
+
546
+ def _get_full_content(
547
+ self, relevant_items: List[Dict[str, Any]]
548
+ ) -> List[Dict[str, Any]]:
511
549
  """
512
550
  Get full content for the relevant Semantic Scholar papers.
513
551
  Gets additional details like citations, references, and full metadata.
514
-
552
+
515
553
  Args:
516
554
  relevant_items: List of relevant preview dictionaries
517
-
555
+
518
556
  Returns:
519
557
  List of result dictionaries with full content
520
558
  """
521
559
  # Check if we should add full content
522
- if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
560
+ if (
561
+ hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
562
+ and search_config.SEARCH_SNIPPETS_ONLY
563
+ ):
523
564
  logger.info("Snippet-only mode, skipping full content retrieval")
524
565
  return relevant_items
525
-
526
- logger.info(f"Getting content for {len(relevant_items)} Semantic Scholar papers")
527
-
566
+
567
+ logger.info(
568
+ f"Getting content for {len(relevant_items)} Semantic Scholar papers"
569
+ )
570
+
528
571
  results = []
529
572
  for item in relevant_items:
530
573
  result = item.copy()
531
574
  paper_id = item.get("_paper_id", "")
532
-
575
+
533
576
  # Skip if no paper ID
534
577
  if not paper_id:
535
578
  results.append(result)
536
579
  continue
537
-
580
+
538
581
  # Get paper details if citations or references are requested
539
582
  if self.get_citations or self.get_references or self.get_embeddings:
540
583
  paper_details = self._get_paper_details(paper_id)
541
-
584
+
542
585
  if paper_details:
543
586
  # Add citation information
544
587
  if self.get_citations and "citations" in paper_details:
545
588
  result["citations"] = paper_details["citations"]
546
-
589
+
547
590
  # Add reference information
548
591
  if self.get_references and "references" in paper_details:
549
592
  result["references"] = paper_details["references"]
550
-
593
+
551
594
  # Add embedding if available
552
595
  if self.get_embeddings and "embedding" in paper_details:
553
596
  result["embedding"] = paper_details["embedding"]
554
-
597
+
555
598
  # Add fields of study
556
599
  if "fieldsOfStudy" in paper_details:
557
600
  result["fields_of_study"] = paper_details["fieldsOfStudy"]
558
-
601
+
559
602
  # Remove temporary fields
560
603
  if "_paper_id" in result:
561
604
  del result["_paper_id"]
@@ -563,7 +606,7 @@ Format each query on a new line with no numbering or explanation. Keep each quer
563
606
  del result["_search_strategy"]
564
607
  if "_full_paper" in result:
565
608
  del result["_full_paper"]
566
-
609
+
567
610
  results.append(result)
568
-
569
- return results
611
+
612
+ return results