local-deep-research 0.1.15__tar.gz → 0.1.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {local_deep_research-0.1.15/src/local_deep_research.egg-info → local_deep_research-0.1.16}/PKG-INFO +1 -1
  2. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/pyproject.toml +1 -1
  3. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/defaults/search_engines.toml +2 -2
  4. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/search_system.py +8 -9
  5. local_deep_research-0.1.16/src/local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +569 -0
  6. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/search_engine_base.py +5 -14
  7. {local_deep_research-0.1.15 → local_deep_research-0.1.16/src/local_deep_research.egg-info}/PKG-INFO +1 -1
  8. local_deep_research-0.1.15/src/local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +0 -1128
  9. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/LICENSE +0 -0
  10. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/MANIFEST.in +0 -0
  11. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/README.md +0 -0
  12. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/requirements.txt +0 -0
  13. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/setup.cfg +0 -0
  14. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/__init__.py +0 -0
  15. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/citation_handler.py +0 -0
  16. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/config.py +0 -0
  17. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/defaults/__init__.py +0 -0
  18. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/defaults/llm_config.py +0 -0
  19. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/defaults/local_collections.toml +0 -0
  20. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/defaults/main.toml +0 -0
  21. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/local_collections.py +0 -0
  22. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/main.py +0 -0
  23. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/report_generator.py +0 -0
  24. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/utilties/__init__.py +0 -0
  25. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/utilties/enums.py +0 -0
  26. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/utilties/llm_utils.py +0 -0
  27. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/utilties/search_utilities.py +0 -0
  28. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/utilties/setup_utils.py +0 -0
  29. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web/__init__.py +0 -0
  30. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web/app.py +0 -0
  31. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web/static/css/styles.css +0 -0
  32. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web/static/js/app.js +0 -0
  33. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web/templates/api_keys_config.html +0 -0
  34. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web/templates/collections_config.html +0 -0
  35. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web/templates/index.html +0 -0
  36. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web/templates/llm_config.html +0 -0
  37. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web/templates/main_config.html +0 -0
  38. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web/templates/search_engines_config.html +0 -0
  39. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web/templates/settings.html +0 -0
  40. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web/templates/settings_dashboard.html +0 -0
  41. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/__init__.py +0 -0
  42. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/__init__.py +0 -0
  43. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/full_search.py +0 -0
  44. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/meta_search_engine.py +0 -0
  45. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/search_engine_arxiv.py +0 -0
  46. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/search_engine_brave.py +0 -0
  47. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/search_engine_ddg.py +0 -0
  48. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/search_engine_github.py +0 -0
  49. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/search_engine_google_pse.py +0 -0
  50. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/search_engine_guardian.py +0 -0
  51. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/search_engine_local.py +0 -0
  52. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/search_engine_local_all.py +0 -0
  53. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/search_engine_pubmed.py +0 -0
  54. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/search_engine_searxng.py +0 -0
  55. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/search_engine_serpapi.py +0 -0
  56. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/search_engine_wayback.py +0 -0
  57. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +0 -0
  58. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/full_search.py +0 -0
  59. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/search_engine_factory.py +0 -0
  60. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research/web_search_engines/search_engines_config.py +0 -0
  61. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research.egg-info/SOURCES.txt +0 -0
  62. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research.egg-info/dependency_links.txt +0 -0
  63. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research.egg-info/entry_points.txt +0 -0
  64. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research.egg-info/requires.txt +0 -0
  65. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/src/local_deep_research.egg-info/top_level.txt +0 -0
  66. {local_deep_research-0.1.15 → local_deep_research-0.1.16}/tests/test_google_pse.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: local-deep-research
- Version: 0.1.15
+ Version: 0.1.16
  Summary: AI-powered research assistant with deep, iterative analysis using LLMs and web searches
  Author-email: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com>, HashedViking <6432677+HashedViking@users.noreply.github.com>
  License: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "local-deep-research"
- version = "0.1.15"
+ version = "0.1.16"
  description = "AI-powered research assistant with deep, iterative analysis using LLMs and web searches"
  readme = "README.md"
  requires-python = ">=3.8"
@@ -37,7 +37,7 @@ module_path = "local_deep_research.web_search_engines.engines.search_engine_pubm
  class_name = "PubMedSearchEngine"
  requires_api_key = false
  api_key_env = "NCBI_API_KEY"
- reliability = 0.95
+ reliability = 0.98
  strengths = [
  "biomedical literature", "medical research", "clinical studies",
  "life sciences", "health information", "scientific papers"
@@ -191,7 +191,7 @@ module_path = "local_deep_research.web_search_engines.engines.search_engine_sema
  class_name = "SemanticScholarSearchEngine"
  requires_api_key = false
  api_key_env = "S2_API_KEY"
- reliability = 0.95
+ reliability = 0.87
  strengths = [
  "comprehensive scientific literature",
  "extensive citation network",
@@ -28,7 +28,7 @@ class AdvancedSearchSystem:

  # Check if search is available, log warning if not
  if self.search is None:
- print("WARNING: Search system initialized with no search engine! Research will not be effective.")
+ logger.info("WARNING: Search system initialized with no search engine! Research will not be effective.")
  self._update_progress("WARNING: No search engine available", None, {"error": "No search engine configured properly"})


@@ -101,7 +101,7 @@ class AdvancedSearchSystem:
  self._update_progress("Knowledge compression complete", None)
  response = remove_think_tags(response.content)
  response = str(response) #+ "\n\n" + str(formatted_links)
- print(response)
+
  return response

  def analyze_topic(self, query: str) -> Dict:
@@ -165,7 +165,7 @@ class AdvancedSearchSystem:
  search_results = self.search.run(question)
  except Exception as e:
  error_msg = f"Error during search: {str(e)}"
- print(f"SEARCH ERROR: {error_msg}")
+ logger.info(f"SEARCH ERROR: {error_msg}")
  self._update_progress(error_msg,
  int(question_progress_base + 2),
  {"phase": "search_error", "error": str(e)})
@@ -190,7 +190,7 @@ class AdvancedSearchSystem:
  self._update_progress(f"Analyzing results for: {question}",
  int(question_progress_base + 5),
  {"phase": "analysis"})
- print("NR OF SOURCES: ", len(self.all_links_of_system))
+

  try:
  result = self.citation_handler.analyze_followup(
@@ -203,7 +203,7 @@ class AdvancedSearchSystem:
  if links:
  formatted_links=format_links(links=links)

- logger.debug(f"Generated questions: {formatted_links}")
+ logger.info(f"Generated questions: {formatted_links}")
  if result is not None:
  results_with_links = str(result["content"])
  findings.append(
@@ -219,7 +219,6 @@ class AdvancedSearchSystem:
  if settings.general.knowledge_accumulation != str(KnowledgeAccumulationApproach.NO_KNOWLEDGE.value):
  current_knowledge = current_knowledge + "\n\n\n New: \n" + results_with_links

- logger.info(settings.general.knowledge_accumulation)
  if settings.general.knowledge_accumulation == str(KnowledgeAccumulationApproach.QUESTION.value):
  logger.info("Compressing knowledge")
  self._update_progress(f"Compress Knowledge for: {question}",
@@ -232,7 +231,7 @@ class AdvancedSearchSystem:
  {"phase": "analysis_complete"})
  except Exception as e:
  error_msg = f"Error analyzing results: {str(e)}"
- print(f"ANALYSIS ERROR: {error_msg}")
+ logger.info(f"ANALYSIS ERROR: {error_msg}")
  self._update_progress(error_msg,
  int(question_progress_base + 10),
  {"phase": "analysis_error", "error": str(e)})
@@ -251,7 +250,7 @@ class AdvancedSearchSystem:
  logger.info("FINISHED ITERATION - Compressing Knowledge")
  except Exception as e:
  error_msg = f"Error compressing knowledge: {str(e)}"
- print(f"COMPRESSION ERROR: {error_msg}")
+ logger.info(f"COMPRESSION ERROR: {error_msg}")
  self._update_progress(error_msg,
  int((iteration / total_iterations) * 100 - 3),
  {"phase": "compression_error", "error": str(e)})
@@ -266,7 +265,7 @@ class AdvancedSearchSystem:
  formatted_findings = self._save_findings(findings, current_knowledge, query)
  except Exception as e:
  error_msg = f"Error saving findings: {str(e)}"
- print(f"SAVE ERROR: {error_msg}")
+ logger.info(f"SAVE ERROR: {error_msg}")
  self._update_progress(error_msg,
  int((iteration / total_iterations) * 100),
  {"phase": "save_error", "error": str(e)})
@@ -0,0 +1,569 @@
+ import requests
+ import logging
+ import json
+ from typing import Dict, List, Any, Optional, Tuple, Union
+ from langchain_core.language_models import BaseLLM
+ import time
+ import re
+ from datetime import datetime
+ from requests.adapters import HTTPAdapter
+ from urllib3.util import Retry
+
+ from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
+ from local_deep_research import config
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class SemanticScholarSearchEngine(BaseSearchEngine):
+ """
+ Semantic Scholar search engine implementation with two-phase approach.
+ Provides efficient access to scientific literature across all fields.
+ """
+
+ def __init__(self,
+ max_results: int = 10,
+ api_key: Optional[str] = None,
+ year_range: Optional[Tuple[int, int]] = None,
+ get_abstracts: bool = True,
+ get_references: bool = False,
+ get_citations: bool = False,
+ get_embeddings: bool = False,
+ get_tldr: bool = True,
+ citation_limit: int = 10,
+ reference_limit: int = 10,
+ llm: Optional[BaseLLM] = None,
+ max_filtered_results: Optional[int] = None,
+ optimize_queries: bool = True,
+ max_retries: int = 5,
+ retry_backoff_factor: float = 1.0,
+ fields_of_study: Optional[List[str]] = None,
+ publication_types: Optional[List[str]] = None):
+ """
+ Initialize the Semantic Scholar search engine.
+
+ Args:
+ max_results: Maximum number of search results
+ api_key: Semantic Scholar API key for higher rate limits (optional)
+ year_range: Optional tuple of (start_year, end_year) to filter results
+ get_abstracts: Whether to fetch abstracts for all results
+ get_references: Whether to fetch references for papers
+ get_citations: Whether to fetch citations for papers
+ get_embeddings: Whether to fetch SPECTER embeddings for papers
+ get_tldr: Whether to fetch TLDR summaries for papers
+ citation_limit: Maximum number of citations to fetch per paper
+ reference_limit: Maximum number of references to fetch per paper
+ llm: Language model for relevance filtering
+ max_filtered_results: Maximum number of results to keep after filtering
+ optimize_queries: Whether to optimize natural language queries
+ max_retries: Maximum number of retries for API requests
+ retry_backoff_factor: Backoff factor for retries
+ fields_of_study: List of fields of study to filter results
+ publication_types: List of publication types to filter results
+ """
+ # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+ super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
+
+ self.api_key = api_key
+ self.year_range = year_range
+ self.get_abstracts = get_abstracts
+ self.get_references = get_references
+ self.get_citations = get_citations
+ self.get_embeddings = get_embeddings
+ self.get_tldr = get_tldr
+ self.citation_limit = citation_limit
+ self.reference_limit = reference_limit
+ self.optimize_queries = optimize_queries
+ self.max_retries = max_retries
+ self.retry_backoff_factor = retry_backoff_factor
+ self.fields_of_study = fields_of_study
+ self.publication_types = publication_types
+
+ # Base API URLs
+ self.base_url = "https://api.semanticscholar.org/graph/v1"
+ self.paper_search_url = f"{self.base_url}/paper/search"
+ self.paper_details_url = f"{self.base_url}/paper"
+
+ # Create a session with retry capabilities
+ self.session = self._create_session()
+
+ # Rate limiting
+ self.rate_limit_wait = 1.0 # Default 1 second between requests
+ self.last_request_time = 0
+
+ def _create_session(self) -> requests.Session:
+ """Create and configure a requests session with retry capabilities"""
+ session = requests.Session()
+
+ # Configure automatic retries with exponential backoff
+ retry_strategy = Retry(
+ total=self.max_retries,
+ backoff_factor=self.retry_backoff_factor,
+ status_forcelist=[429, 500, 502, 503, 504],
+ allowed_methods={"HEAD", "GET", "POST", "OPTIONS"}
+ )
+
+ adapter = HTTPAdapter(max_retries=retry_strategy)
+ session.mount("https://", adapter)
+
+ # Set up headers
+ headers = {"Accept": "application/json"}
+ if self.api_key:
+ headers["x-api-key"] = self.api_key
+
+ session.headers.update(headers)
+
+ return session
+
+ def _respect_rate_limit(self):
+ """Apply rate limiting between requests"""
+ current_time = time.time()
+ elapsed = current_time - self.last_request_time
+
+ if elapsed < self.rate_limit_wait:
+ wait_time = self.rate_limit_wait - elapsed
+ logger.debug(f"Rate limiting: waiting {wait_time:.2f}s")
+ time.sleep(wait_time)
+
+ self.last_request_time = time.time()
+
+ def _make_request(self, url: str, params: Optional[Dict] = None, data: Optional[Dict] = None,
+ method: str = "GET") -> Dict:
+ """
+ Make a request to the Semantic Scholar API.
+
+ Args:
+ url: API endpoint URL
+ params: Query parameters
+ data: JSON data for POST requests
+ method: HTTP method (GET or POST)
+
+ Returns:
+ API response as dictionary
+ """
+ self._respect_rate_limit()
+
+ try:
+ if method.upper() == "GET":
+ response = self.session.get(url, params=params, timeout=30)
+ elif method.upper() == "POST":
+ response = self.session.post(url, params=params, json=data, timeout=30)
+ else:
+ raise ValueError(f"Unsupported HTTP method: {method}")
+
+ # Handle rate limiting manually if retry strategy fails
+ if response.status_code == 429:
+ logger.warning("Rate limit exceeded, waiting and retrying...")
+ time.sleep(2.0) # Wait longer on rate limit
+ self.rate_limit_wait *= 1.5 # Increase wait time for future requests
+ return self._make_request(url, params, data, method) # Retry
+
+ response.raise_for_status()
+ return response.json()
+ except requests.RequestException as e:
+ logger.error(f"API request failed: {e}")
+ return {}
+
+ def _optimize_query(self, query: str) -> str:
+ """
+ Optimize a natural language query for Semantic Scholar search.
+ If LLM is available, uses it to extract key terms and concepts.
+
+ Args:
+ query: Natural language query
+
+ Returns:
+ Optimized query string
+ """
+ if not self.llm or not self.optimize_queries:
+ return query
+
+ try:
+ prompt = f"""Transform this natural language question into an optimized academic search query.
+
+ Original query: "{query}"
+
+ INSTRUCTIONS:
+ 1. Extract key academic concepts, technical terms, and proper nouns
+ 2. Remove generic words, filler words, and non-technical terms
+ 3. Add quotation marks around specific phrases that should be kept together
+ 4. Return ONLY the optimized search query with no explanation
+ 5. Keep it under 100 characters if possible
+
+ EXAMPLE TRANSFORMATIONS:
+ "What are the latest findings about mRNA vaccines and COVID-19?" → "mRNA vaccines COVID-19 recent findings"
+ "How does machine learning impact climate change prediction?" → "machine learning "climate change" prediction"
+ "Tell me about quantum computing approaches for encryption" → "quantum computing encryption"
+
+ Return ONLY the optimized search query with no explanation.
+ """
+
+ response = self.llm.invoke(prompt)
+ optimized_query = response.content.strip()
+
+ # Clean up the query - remove any explanations
+ lines = optimized_query.split('\n')
+ optimized_query = lines[0].strip()
+
+ # Safety check - if query looks too much like an explanation, use original
+ if len(optimized_query.split()) > 15 or ":" in optimized_query:
+ logger.warning("Query optimization result looks too verbose, using original")
+ return query
+
+ logger.info(f"Original query: '{query}'")
+ logger.info(f"Optimized for search: '{optimized_query}'")
+
+ return optimized_query
+ except Exception as e:
+ logger.error(f"Error optimizing query: {e}")
+ return query # Fall back to original query on error
+
+ def _direct_search(self, query: str) -> List[Dict[str, Any]]:
+ """
+ Make a direct search request to the Semantic Scholar API.
+
+ Args:
+ query: The search query
+
+ Returns:
+ List of paper dictionaries
+ """
+ try:
+ # Configure fields to retrieve
+ fields = [
+ "paperId",
+ "externalIds",
+ "url",
+ "title",
+ "abstract",
+ "venue",
+ "year",
+ "authors"
+ ]
+
+ if self.get_tldr:
+ fields.append("tldr")
+
+ params = {
+ "query": query,
+ "limit": min(self.max_results, 100), # API limit is 100 per request
+ "fields": ",".join(fields)
+ }
+
+ # Add year filter if specified
+ if self.year_range:
+ start_year, end_year = self.year_range
+ params["year"] = f"{start_year}-{end_year}"
+
+ # Add fields of study filter if specified
+ if self.fields_of_study:
+ params["fieldsOfStudy"] = ",".join(self.fields_of_study)
+
+ # Add publication types filter if specified
+ if self.publication_types:
+ params["publicationTypes"] = ",".join(self.publication_types)
+
+ response = self._make_request(self.paper_search_url, params)
+
+ if "data" in response:
+ papers = response["data"]
+ logger.info(f"Found {len(papers)} papers with direct search for query: '{query}'")
+ return papers
+ else:
+ logger.warning(f"No data in response for direct search query: '{query}'")
+ return []
+
+ except Exception as e:
+ logger.error(f"Error in direct search: {e}")
+ return []
+
+ def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
+ """
+ Perform an adaptive search that adjusts based on result volume.
+ Uses LLM to generate better fallback queries when available.
+
+ Args:
+ query: The search query
+
+ Returns:
+ Tuple of (list of paper results, search strategy used)
+ """
+ # Start with a standard search
+ papers = self._direct_search(query)
+ strategy = "standard"
+
+ # If no results, try different variations
+ if not papers:
+ # Try removing quotes to broaden search
+ if '"' in query:
+ unquoted_query = query.replace('"', '')
+ logger.info(f"No results with quoted terms, trying without quotes: {unquoted_query}")
+ papers = self._direct_search(unquoted_query)
+
+ if papers:
+ strategy = "unquoted"
+ return papers, strategy
+
+ # If LLM is available, use it to generate better fallback queries
+ if self.llm:
+ try:
+ # Generate alternate search queries focusing on core concepts
+ prompt = f"""You are helping refine a search query that returned no results.
+
+ Original query: "{query}"
+
+ The query might be too specific or use natural language phrasing that doesn't match academic paper keywords.
+
+ Please provide THREE alternative search queries that:
+ 1. Focus on the core academic concepts
+ 2. Use precise terminology commonly found in academic papers
+ 3. Break down complex queries into more searchable components
+ 4. Format each as a concise keyword-focused search term (not a natural language question)
+
+ Format each query on a new line with no numbering or explanation. Keep each query under 8 words and very focused.
+ """
+ # Get the LLM's response
+ response = self.llm.invoke(prompt)
+
+ # Extract the alternative queries
+ alt_queries = []
+ if hasattr(response, 'content'): # Handle various LLM response formats
+ content = response.content
+ alt_queries = [q.strip() for q in content.strip().split('\n') if q.strip()]
+ elif isinstance(response, str):
+ alt_queries = [q.strip() for q in response.strip().split('\n') if q.strip()]
+
+ # Try each alternative query
+ for alt_query in alt_queries[:3]: # Limit to first 3 alternatives
+ logger.info(f"Trying LLM-suggested query: {alt_query}")
+ alt_papers = self._direct_search(alt_query)
+
+ if alt_papers:
+ logger.info(f"Found {len(alt_papers)} papers using LLM-suggested query: {alt_query}")
+ strategy = "llm_alternative"
+ return alt_papers, strategy
+ except Exception as e:
+ logger.error(f"Error using LLM for query refinement: {e}")
+ # Fall through to simpler strategies
+
+ # Fallback: Try with the longest words (likely specific terms)
+ words = re.findall(r'\w+', query)
+ longer_words = [word for word in words if len(word) > 6]
+ if longer_words:
+ # Use up to 3 of the longest words
+ longer_words = sorted(longer_words, key=len, reverse=True)[:3]
+ key_terms_query = ' '.join(longer_words)
+ logger.info(f"Trying with key terms: {key_terms_query}")
+ papers = self._direct_search(key_terms_query)
+
+ if papers:
+ strategy = "key_terms"
+ return papers, strategy
+
+ # Final fallback: Try with just the longest word
+ if words:
+ longest_word = max(words, key=len)
+ if len(longest_word) > 5: # Only use if it's reasonably long
+ logger.info(f"Trying with single key term: {longest_word}")
+ papers = self._direct_search(longest_word)
+
+ if papers:
+ strategy = "single_term"
+ return papers, strategy
+
+ return papers, strategy
+
+ def _get_paper_details(self, paper_id: str) -> Dict[str, Any]:
+ """
+ Get detailed information about a specific paper.
+
+ Args:
+ paper_id: Semantic Scholar Paper ID
+
+ Returns:
+ Dictionary with paper details
+ """
+ try:
+ # Construct fields parameter
+ fields = [
+ "paperId",
+ "externalIds",
+ "corpusId",
+ "url",
+ "title",
+ "abstract",
+ "venue",
+ "year",
+ "authors",
+ "fieldsOfStudy"
+ ]
+
+ if self.get_tldr:
+ fields.append("tldr")
+
+ if self.get_embeddings:
+ fields.append("embedding")
+
+ # Add citation and reference fields if requested
+ if self.get_citations:
+ fields.append(f"citations.limit({self.citation_limit})")
+
+ if self.get_references:
+ fields.append(f"references.limit({self.reference_limit})")
+
+ # Make the request
+ url = f"{self.paper_details_url}/{paper_id}"
+ params = {"fields": ",".join(fields)}
+
+ return self._make_request(url, params)
+
+ except Exception as e:
+ logger.error(f"Error getting paper details for {paper_id}: {e}")
+ return {}
+
+ def _get_previews(self, query: str) -> List[Dict[str, Any]]:
+ """
+ Get preview information for Semantic Scholar papers.
+
+ Args:
+ query: The search query
+
+ Returns:
+ List of preview dictionaries
+ """
+ logger.info(f"Getting Semantic Scholar previews for query: {query}")
+
+ # Optimize the query if LLM is available
+ optimized_query = self._optimize_query(query)
+
+ # Use the adaptive search approach
+ papers, strategy = self._adaptive_search(optimized_query)
+
+ if not papers:
+ logger.warning(f"No Semantic Scholar results found")
+ return []
+
+ # Format as previews
+ previews = []
+ for paper in papers:
+ try:
+ # Format authors - ensure we have a valid list with string values
+ authors = []
+ if "authors" in paper and paper["authors"]:
+ authors = [author.get("name", "") for author in paper["authors"] if author and author.get("name")]
+
+ # Ensure we have valid strings for all fields
+ paper_id = paper.get("paperId", "")
+ title = paper.get("title", "")
+ url = paper.get("url", "")
+
+ # Handle abstract safely, ensuring we always have a string
+ abstract = paper.get("abstract")
+ snippet = ""
+ if abstract:
+ snippet = abstract[:250] + "..." if len(abstract) > 250 else abstract
+
+ venue = paper.get("venue", "")
+ year = paper.get("year")
+ external_ids = paper.get("externalIds", {})
+
+ # Handle TLDR safely
+ tldr_text = ""
+ if paper.get("tldr") and isinstance(paper.get("tldr"), dict):
+ tldr_text = paper.get("tldr", {}).get("text", "")
+
+ # Create preview with basic information, ensuring no None values
+ preview = {
+ "id": paper_id if paper_id else "",
+ "title": title if title else "",
+ "link": url if url else "",
+ "snippet": snippet,
+ "authors": authors,
+ "venue": venue if venue else "",
+ "year": year,
+ "external_ids": external_ids if external_ids else {},
+ "source": "Semantic Scholar",
+ "_paper_id": paper_id if paper_id else "",
+ "_search_strategy": strategy,
+ "tldr": tldr_text
+ }
+
+ # Store the full paper object for later reference
+ preview["_full_paper"] = paper
+
+ previews.append(preview)
+ except Exception as e:
+ logger.error(f"Error processing paper preview: {e}")
+ # Continue with the next paper
+
+ # Sort by year (newer first) if available
+ previews = sorted(
+ previews,
+ key=lambda p: p.get("year", 0) if p.get("year") is not None else 0,
+ reverse=True
+ )
+
+ logger.info(f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}")
+ return previews
+
+ def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+ """
+ Get full content for the relevant Semantic Scholar papers.
+ Gets additional details like citations, references, and full metadata.
+
+ Args:
+ relevant_items: List of relevant preview dictionaries
+
+ Returns:
+ List of result dictionaries with full content
+ """
+ # Check if we should add full content
+ if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+ logger.info("Snippet-only mode, skipping full content retrieval")
+ return relevant_items
+
+ logger.info(f"Getting content for {len(relevant_items)} Semantic Scholar papers")
+
+ results = []
+ for item in relevant_items:
+ result = item.copy()
+ paper_id = item.get("_paper_id", "")
+
+ # Skip if no paper ID
+ if not paper_id:
+ results.append(result)
+ continue
+
+ # Get paper details if citations or references are requested
+ if self.get_citations or self.get_references or self.get_embeddings:
+ paper_details = self._get_paper_details(paper_id)
+
+ if paper_details:
+ # Add citation information
+ if self.get_citations and "citations" in paper_details:
+ result["citations"] = paper_details["citations"]
+
+ # Add reference information
+ if self.get_references and "references" in paper_details:
+ result["references"] = paper_details["references"]
+
+ # Add embedding if available
+ if self.get_embeddings and "embedding" in paper_details:
+ result["embedding"] = paper_details["embedding"]
+
+ # Add fields of study
+ if "fieldsOfStudy" in paper_details:
+ result["fields_of_study"] = paper_details["fieldsOfStudy"]
+
+ # Remove temporary fields
+ if "_paper_id" in result:
+ del result["_paper_id"]
+ if "_search_strategy" in result:
+ del result["_search_strategy"]
+ if "_full_paper" in result:
+ del result["_full_paper"]
+
+ results.append(result)
+
+ return results
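For orientation, a minimal usage sketch of the rewritten engine (not part of the released diff). It assumes only what the files above show: the constructor parameters from __init__ and a run(query) method inherited from BaseSearchEngine, which search_system.py invokes as self.search.run(question); the returned dictionaries are the previews built in _get_previews.

    from local_deep_research.web_search_engines.engines.search_engine_semantic_scholar import SemanticScholarSearchEngine

    # Illustrative configuration; every parameter comes from the __init__ signature above.
    engine = SemanticScholarSearchEngine(
        max_results=5,            # cap on papers returned per search
        year_range=(2020, 2024),  # optional (start_year, end_year) filter
        get_tldr=True,            # include Semantic Scholar TLDR summaries
        api_key=None,             # optional key (S2_API_KEY) for higher rate limits
    )

    # run() is assumed from BaseSearchEngine, mirroring self.search.run(question) in search_system.py.
    for paper in engine.run("mRNA vaccines COVID-19 recent findings"):
        print(paper.get("year"), paper.get("title"), paper.get("link"))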