local-deep-research 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- local_deep_research/config.py +108 -53
- local_deep_research/defaults/search_engines.toml +39 -18
- local_deep_research/search_system.py +16 -10
- local_deep_research/utilties/enums.py +4 -4
- local_deep_research/web/app.py +3 -2
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +3 -5
- local_deep_research/web_search_engines/engines/search_engine_brave.py +3 -5
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +4 -3
- local_deep_research/web_search_engines/engines/search_engine_github.py +2 -4
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +2 -4
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +323 -78
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +3 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +3 -4
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +3 -2
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +1128 -0
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +2 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +2 -4
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +2 -4
- local_deep_research/web_search_engines/search_engine_base.py +12 -4
- {local_deep_research-0.1.12.dist-info → local_deep_research-0.1.14.dist-info}/METADATA +1 -1
- {local_deep_research-0.1.12.dist-info → local_deep_research-0.1.14.dist-info}/RECORD +25 -25
- local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +0 -623
- {local_deep_research-0.1.12.dist-info → local_deep_research-0.1.14.dist-info}/WHEEL +0 -0
- {local_deep_research-0.1.12.dist-info → local_deep_research-0.1.14.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.1.12.dist-info → local_deep_research-0.1.14.dist-info}/licenses/LICENSE +0 -0
- {local_deep_research-0.1.12.dist-info → local_deep_research-0.1.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1128 @@
import requests
import logging
import json
from typing import Dict, List, Any, Optional, Tuple, Union
from langchain_core.language_models import BaseLLM
import time
import re
from datetime import datetime
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
from local_deep_research import config

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class SemanticScholarSearchEngine(BaseSearchEngine):
    """
    Semantic Scholar search engine implementation with two-phase approach.
    Provides efficient access to scientific literature across all fields.
    """

    def __init__(self,
                 max_results: int = 10,
                 api_key: Optional[str] = None,
                 year_range: Optional[Tuple[int, int]] = None,
                 get_abstracts: bool = True,
                 get_references: bool = False,
                 get_citations: bool = False,
                 get_embeddings: bool = False,
                 get_tldr: bool = True,
                 citation_limit: int = 10,
                 reference_limit: int = 10,
                 llm: Optional[BaseLLM] = None,
                 max_filtered_results: Optional[int] = None,
                 optimize_queries: bool = True,
                 max_retries: int = 5,
                 retry_backoff_factor: float = 1.0,
                 fields_of_study: Optional[List[str]] = None,
                 publication_types: Optional[List[str]] = None):
        """
        Initialize the Semantic Scholar search engine.

        Args:
            max_results: Maximum number of search results
            api_key: Semantic Scholar API key for higher rate limits (optional)
            year_range: Optional tuple of (start_year, end_year) to filter results
            get_abstracts: Whether to fetch abstracts for all results
            get_references: Whether to fetch references for papers
            get_citations: Whether to fetch citations for papers
            get_embeddings: Whether to fetch SPECTER embeddings for papers
            get_tldr: Whether to fetch TLDR summaries for papers
            citation_limit: Maximum number of citations to fetch per paper
            reference_limit: Maximum number of references to fetch per paper
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            optimize_queries: Whether to optimize natural language queries
            max_retries: Maximum number of retries for API requests
            retry_backoff_factor: Backoff factor for retries
            fields_of_study: List of fields of study to filter results
            publication_types: List of publication types to filter results
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
        self.api_key = api_key
        self.year_range = year_range
        self.get_abstracts = get_abstracts
        self.get_references = get_references
        self.get_citations = get_citations
        self.get_embeddings = get_embeddings
        self.get_tldr = get_tldr
        self.citation_limit = citation_limit
        self.reference_limit = reference_limit
        self.optimize_queries = optimize_queries
        self.max_retries = max_retries
        self.retry_backoff_factor = retry_backoff_factor
        self.fields_of_study = fields_of_study
        self.publication_types = publication_types

        # Base API URLs
        self.base_url = "https://api.semanticscholar.org/graph/v1"
        self.paper_search_url = f"{self.base_url}/paper/search"
        self.paper_bulk_search_url = f"{self.base_url}/paper/search/bulk"
        self.paper_batch_url = f"{self.base_url}/paper/batch"
        self.paper_details_url = f"{self.base_url}/paper"
        self.author_search_url = f"{self.base_url}/author/search"
        self.author_details_url = f"{self.base_url}/author"
        self.recommendations_url = "https://api.semanticscholar.org/recommendations/v1/papers"
        self.datasets_url = "https://api.semanticscholar.org/datasets/v1"

        # Create a session with retry capabilities
        self.session = self._create_session()

        # Rate limiting
        self.rate_limit_wait = 1.0  # Default 1 second between requests
        self.last_request_time = 0

    def _create_session(self) -> requests.Session:
        """Create and configure a requests session with retry capabilities"""
        session = requests.Session()

        # Configure automatic retries with exponential backoff
        retry_strategy = Retry(
            total=self.max_retries,
            backoff_factor=self.retry_backoff_factor,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods={"HEAD", "GET", "POST", "OPTIONS"}
        )

        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("https://", adapter)

        # Set up headers
        headers = {"Accept": "application/json"}
        if self.api_key:
            headers["x-api-key"] = self.api_key

        session.headers.update(headers)

        return session

    def _respect_rate_limit(self):
        """Apply rate limiting between requests"""
        current_time = time.time()
        elapsed = current_time - self.last_request_time

        if elapsed < self.rate_limit_wait:
            wait_time = self.rate_limit_wait - elapsed
            logger.debug(f"Rate limiting: waiting {wait_time:.2f}s")
            time.sleep(wait_time)

        self.last_request_time = time.time()

    def _get_headers(self) -> Dict[str, str]:
        """Get the headers for API requests"""
        headers = {"Accept": "application/json"}

        if self.api_key:
            headers["x-api-key"] = self.api_key

        return headers

    def _make_request(self, url: str, params: Optional[Dict] = None, data: Optional[Dict] = None,
                      method: str = "GET") -> Dict:
        """
        Make a request to the Semantic Scholar API.

        Args:
            url: API endpoint URL
            params: Query parameters
            data: JSON data for POST requests
            method: HTTP method (GET or POST)

        Returns:
            API response as dictionary
        """
        self._respect_rate_limit()

        try:
            if method.upper() == "GET":
                response = self.session.get(url, params=params, timeout=30)
            elif method.upper() == "POST":
                response = self.session.post(url, params=params, json=data, timeout=30)
            else:
                raise ValueError(f"Unsupported HTTP method: {method}")

            # Handle rate limiting manually if retry strategy fails
            if response.status_code == 429:
                logger.warning("Rate limit exceeded, waiting and retrying...")
                time.sleep(2.0)  # Wait longer on rate limit
                self.rate_limit_wait *= 1.5  # Increase wait time for future requests
                return self._make_request(url, params, data, method)  # Retry

            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            logger.error(f"API request failed: {e}")
            return {}

    def _optimize_query(self, query: str) -> str:
        """
        Optimize a natural language query for Semantic Scholar search.
        If LLM is available, uses it to extract key terms and concepts.

        Args:
            query: Natural language query

        Returns:
            Optimized query string
        """
        if not self.llm or not self.optimize_queries:
            return query

        try:
            prompt = f"""Transform this natural language question into an optimized academic search query.

Original query: "{query}"

INSTRUCTIONS:
1. Extract key academic concepts, technical terms, and proper nouns
2. Remove generic words, filler words, and non-technical terms
3. Add quotation marks around specific phrases that should be kept together
4. Return ONLY the optimized search query with no explanation
5. Keep it under 100 characters if possible

EXAMPLE TRANSFORMATIONS:
"What are the latest findings about mRNA vaccines and COVID-19?" → "mRNA vaccines COVID-19 recent findings"
"How does machine learning impact climate change prediction?" → "machine learning "climate change" prediction"
"Tell me about quantum computing approaches for encryption" → "quantum computing encryption"

Return ONLY the optimized search query with no explanation.
"""

            response = self.llm.invoke(prompt)
            optimized_query = response.content.strip()

            # Clean up the query - remove any explanations
            lines = optimized_query.split('\n')
            optimized_query = lines[0].strip()

            # Safety check - if query looks too much like an explanation, use original
            if len(optimized_query.split()) > 15 or ":" in optimized_query:
                logger.warning("Query optimization result looks too verbose, using original")
                return query

            logger.info(f"Original query: '{query}'")
            logger.info(f"Optimized for Semantic Scholar: '{optimized_query}'")

            return optimized_query
        except Exception as e:
            logger.error(f"Error optimizing query: {e}")
            return query  # Fall back to original query on error

    def _search_papers(self, query: str) -> List[Dict[str, Any]]:
        """
        Search for papers matching the query.

        Args:
            query: The search query

        Returns:
            List of paper dictionaries
        """
        try:
            fields = [
                "paperId",
                "externalIds",
                "url",
                "title",
                "abstract",
                "venue",
                "year",
                "authors"
            ]

            if self.get_tldr:
                fields.append("tldr")

            params = {
                "query": query,
                "limit": min(self.max_results, 100),  # Regular search API can return up to 100 results
                "fields": ",".join(fields)
            }

            # Add year filter if specified
            if self.year_range:
                start_year, end_year = self.year_range
                params["year"] = f"{start_year}-{end_year}"

            # Add fields of study filter if specified
            if self.fields_of_study:
                params["fieldsOfStudy"] = ",".join(self.fields_of_study)

            # Add publication types filter if specified
            if self.publication_types:
                params["publicationTypes"] = ",".join(self.publication_types)

            response = self._make_request(self.paper_search_url, params)

            if "data" in response:
                papers = response["data"]
                logger.info(f"Found {len(papers)} papers matching query: '{query}'")
                return papers
            else:
                logger.warning(f"No data in response for query: '{query}'")
                return []

        except Exception as e:
            logger.error(f"Error searching papers: {e}")
            return []

    def _search_papers_bulk(self, query: str, limit: int = 1000) -> List[Dict[str, Any]]:
        """
        Search for papers using the bulk search API, which can return up to 1000 papers.

        Args:
            query: The search query
            limit: Maximum number of results (up to 1000)

        Returns:
            List of paper dictionaries
        """
        try:
            fields = [
                "paperId",
                "externalIds",
                "url",
                "title",
                "abstract",
                "venue",
                "year",
                "authors",
                "fieldsOfStudy"
            ]

            if self.get_tldr:
                fields.append("tldr")

            params = {
                "query": query,
                "limit": min(limit, 1000),  # Bulk search API can return up to 1000 results
                "fields": ",".join(fields)
            }

            # Add year filter if specified
            if self.year_range:
                start_year, end_year = self.year_range
                params["year"] = f"{start_year}-{end_year}"

            # Add fields of study filter if specified
            if self.fields_of_study:
                params["fieldsOfStudy"] = ",".join(self.fields_of_study)

            # Add publication types filter if specified
            if self.publication_types:
                params["publicationTypes"] = ",".join(self.publication_types)

            response = self._make_request(self.paper_bulk_search_url, params)

            if "data" in response:
                papers = response["data"]
                logger.info(f"Found {len(papers)} papers using bulk search for query: '{query}'")
                total_count = response.get("total", 0)
                logger.info(f"Total available results: {total_count}")

                # Handle continuation token for pagination if needed
                if "token" in response and len(papers) < min(total_count, limit):
                    token = response["token"]
                    logger.info(f"Continuation token available: {token}")
                    # The caller would need to handle continuation tokens for pagination

                return papers
            else:
                logger.warning(f"No data in response for bulk query: '{query}'")
                return []

        except Exception as e:
            logger.error(f"Error in bulk paper search: {e}")
            return []

    def _get_paper_details(self, paper_id: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific paper.

        Args:
            paper_id: Semantic Scholar Paper ID

        Returns:
            Dictionary with paper details
        """
        try:
            # Construct fields parameter
            fields = [
                "paperId",
                "externalIds",
                "corpusId",
                "url",
                "title",
                "abstract",
                "venue",
                "year",
                "authors",
                "fieldsOfStudy"
            ]

            if self.get_tldr:
                fields.append("tldr")

            if self.get_embeddings:
                fields.append("embedding")

            # Add citation and reference fields if requested
            if self.get_citations:
                fields.append(f"citations.limit({self.citation_limit})")

            if self.get_references:
                fields.append(f"references.limit({self.reference_limit})")

            # Make the request
            url = f"{self.paper_details_url}/{paper_id}"
            params = {"fields": ",".join(fields)}

            return self._make_request(url, params)

        except Exception as e:
            logger.error(f"Error getting paper details for {paper_id}: {e}")
            return {}

    def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
        """
        Perform an adaptive search that adjusts based on result volume.
        Uses LLM to generate better fallback queries when available.

        Args:
            query: The search query (already optimized)

        Returns:
            Tuple of (list of paper results, search strategy used)
        """
        # Start with a standard search
        papers = self._search_papers(query)
        strategy = "standard"

        # If no results, try different variations
        if not papers:
            # Try removing quotes to broaden search
            if '"' in query:
                unquoted_query = query.replace('"', '')
                logger.info(f"No results with quoted terms, trying without quotes: {unquoted_query}")
                papers = self._search_papers(unquoted_query)

                if papers:
                    strategy = "unquoted"
                    return papers, strategy

            # If LLM is available, use it to generate better fallback queries
            if self.llm:
                try:
                    # Generate alternate search queries focusing on core concepts
                    prompt = f"""You are helping refine a search query for academic papers related to cancer research that returned no results.

Original query: "{query}"

The query might be too specific, contain future dates, or use natural language phrasing that doesn't match academic paper keywords.

Please provide THREE alternative search queries that:
1. Focus on the core academic concepts about cancer treatment, research, or therapies
2. Remove future dates or references to "latest" or "current" (replace with terms like "recent" or "novel")
3. Use precise medical/scientific terminology commonly found in academic papers
4. Break down complex queries into more searchable components
5. Format each as a concise keyword-focused search term (not a natural language question)

Format each query on a new line with no numbering or explanation. Keep each query under 8 words and very focused.
"""
                    # Get the LLM's response
                    response = self.llm.invoke(prompt)

                    # Extract the alternative queries
                    alt_queries = []
                    if hasattr(response, 'content'):  # Handle various LLM response formats
                        content = response.content
                        alt_queries = [q.strip() for q in content.strip().split('\n') if q.strip()]
                    elif isinstance(response, str):
                        alt_queries = [q.strip() for q in response.strip().split('\n') if q.strip()]

                    # Try each alternative query
                    for alt_query in alt_queries[:3]:  # Limit to first 3 alternatives
                        logger.info(f"Trying LLM-suggested query: {alt_query}")
                        alt_papers = self._search_papers(alt_query)

                        if alt_papers:
                            logger.info(f"Found {len(alt_papers)} papers using LLM-suggested query: {alt_query}")
                            strategy = "llm_alternative"
                            return alt_papers, strategy
                except Exception as e:
                    logger.error(f"Error using LLM for query refinement: {e}")
                    # Fall through to simpler strategies

            # Fallback 1: Try extracting important cancer-related terms
            cancer_terms = ["cancer", "tumor", "oncology", "carcinoma", "sarcoma", "leukemia",
                            "lymphoma", "metastasis", "therapy", "immunotherapy", "targeted",
                            "treatment", "drug", "clinical", "trial", "biomarker"]

            words = re.findall(r'\b\w+\b', query.lower())
            important_terms = [word for word in words if word in cancer_terms or len(word) > 7]

            if important_terms:
                important_query = ' '.join(important_terms[:5])  # Limit to 5 terms
                logger.info(f"Trying with important cancer terms: {important_query}")
                papers = self._search_papers(important_query)

                if papers:
                    strategy = "cancer_terms"
                    return papers, strategy

            # Fallback 2: Try with just specific cancer types or treatment modalities
            cancer_types = ["breast", "lung", "colorectal", "prostate", "melanoma", "lymphoma",
                            "leukemia", "myeloma", "sarcoma", "glioblastoma"]
            treatment_types = ["immunotherapy", "chemotherapy", "radiotherapy", "targeted",
                               "surgery", "vaccine", "antibody", "CAR-T", "inhibitor"]

            cancer_matches = [word for word in words if word in cancer_types]
            treatment_matches = [word for word in words if word in treatment_types]

            if cancer_matches and treatment_matches:
                specific_query = f"{cancer_matches[0]} {treatment_matches[0]}"
                logger.info(f"Trying with specific cancer-treatment pair: {specific_query}")
                papers = self._search_papers(specific_query)

                if papers:
                    strategy = "specific_pair"
                    return papers, strategy

            # Fallback 3: Extract the longest word (likely a specific term)
            longest_word = max(re.findall(r'\w+', query), key=len, default='')
            if len(longest_word) > 6:
                logger.info(f"Trying with primary keyword: {longest_word}")
                papers = self._search_papers(longest_word)

                if papers:
                    strategy = "primary_keyword"
                    return papers, strategy

        return papers, strategy

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Semantic Scholar papers.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting Semantic Scholar previews for query: {query}")

        # Optimize the query if LLM is available
        optimized_query = self._optimize_query(query)

        # Perform adaptive search
        papers, strategy = self._adaptive_search(optimized_query)

        if not papers:
            logger.warning(f"No Semantic Scholar results found using strategy: {strategy}")
            return []

        # Format as previews
        previews = []
        for paper in papers:
            try:
                # Format authors - ensure we have a valid list with string values
                authors = []
                if "authors" in paper and paper["authors"]:
                    authors = [author.get("name", "") for author in paper["authors"] if author and author.get("name")]

                # Ensure we have valid strings for all fields
                paper_id = paper.get("paperId", "")
                title = paper.get("title", "")
                url = paper.get("url", "")

                # Handle abstract safely, ensuring we always have a string
                abstract = paper.get("abstract")
                snippet = ""
                if abstract:
                    snippet = abstract[:250] + "..." if len(abstract) > 250 else abstract

                venue = paper.get("venue", "")
                year = paper.get("year")
                external_ids = paper.get("externalIds", {})

                # Handle TLDR safely
                tldr_text = ""
                if paper.get("tldr") and isinstance(paper.get("tldr"), dict):
                    tldr_text = paper.get("tldr", {}).get("text", "")

                # Create preview with basic information, ensuring no None values
                preview = {
                    "id": paper_id if paper_id else "",
                    "title": title if title else "",
                    "link": url if url else "",
                    "snippet": snippet,  # Already handled above
                    "authors": authors,  # List of strings, safe to use directly
                    "venue": venue if venue else "",
                    "year": year,  # Can be None, handled in downstream processing
                    "external_ids": external_ids if external_ids else {},
                    "source": "Semantic Scholar",
                    "_paper_id": paper_id if paper_id else "",
                    "_search_strategy": strategy,
                    "tldr": tldr_text
                }

                # Store the full paper object for later reference
                preview["_full_paper"] = paper

                previews.append(preview)
            except Exception as e:
                logger.error(f"Error processing paper preview: {e}")
                # Continue with the next paper

        logger.info(f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}")
        return previews

    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Semantic Scholar papers.
        Gets additional details like citations, references, and full metadata.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should add full content
        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info(f"Getting content for {len(relevant_items)} Semantic Scholar papers")

        results = []
        for item in relevant_items:
            result = item.copy()
            paper_id = item.get("_paper_id", "")

            # Skip if no paper ID
            if not paper_id:
                results.append(result)
                continue

            # Get paper details if citations or references are requested
            if self.get_citations or self.get_references or self.get_embeddings:
                paper_details = self._get_paper_details(paper_id)

                if paper_details:
                    # Add citation information
                    if self.get_citations and "citations" in paper_details:
                        result["citations"] = paper_details["citations"]

                    # Add reference information
                    if self.get_references and "references" in paper_details:
                        result["references"] = paper_details["references"]

                    # Add embedding if available
                    if self.get_embeddings and "embedding" in paper_details:
                        result["embedding"] = paper_details["embedding"]

                    # Add fields of study
                    if "fieldsOfStudy" in paper_details:
                        result["fields_of_study"] = paper_details["fieldsOfStudy"]

            # Remove temporary fields
            if "_paper_id" in result:
                del result["_paper_id"]
            if "_search_strategy" in result:
                del result["_search_strategy"]
            if "_full_paper" in result:
                del result["_full_paper"]

            results.append(result)

        return results

    def search_by_author(self, author_name: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Search for papers by a specific author.

        Args:
            author_name: Name of the author
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers by the author
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            # First search for the author
            params = {
                "query": author_name,
                "limit": 5  # Limit to top 5 author matches
            }

            response = self._make_request(self.author_search_url, params)

            if "data" not in response or not response["data"]:
                logger.warning(f"No authors found matching: {author_name}")
                return []

            # Use the first (best) author match
            author = response["data"][0]
            author_id = author.get("authorId")

            if not author_id:
                logger.warning(f"No valid author ID found for: {author_name}")
                return []

            # Get the author's papers
            fields = [
                "papers.paperId",
                "papers.title",
                "papers.abstract",
                "papers.venue",
                "papers.year",
                "papers.authors"
            ]

            if self.get_tldr:
                fields.append("papers.tldr")

            url = f"{self.author_details_url}/{author_id}"
            author_params = {
                "fields": ",".join(fields)
            }

            author_data = self._make_request(url, author_params)

            if "papers" not in author_data or not author_data["papers"]:
                logger.warning(f"No papers found for author: {author_name}")
                return []

            # Format as paper results
            papers = author_data["papers"][:self.max_results]

            # Convert to standard results format
            results = []
            for paper in papers:
                # Format authors
                authors = []
                if "authors" in paper and paper["authors"]:
                    authors = [author.get("name", "") for author in paper["authors"]]

                result = {
                    "id": paper.get("paperId", ""),
                    "title": paper.get("title", ""),
                    "link": f"https://www.semanticscholar.org/paper/{paper.get('paperId', '')}",
                    "snippet": paper.get("abstract", "")[:250] + "..." if paper.get("abstract", "") and len(paper.get("abstract", "")) > 250 else paper.get("abstract", ""),
                    "authors": authors,
                    "venue": paper.get("venue", ""),
                    "year": paper.get("year"),
                    "source": "Semantic Scholar",

                    # Include TLDR if available
                    "tldr": paper.get("tldr", {}).get("text", "") if paper.get("tldr") else ""
                }

                results.append(result)

            # Add citations and references if needed
            if self.get_citations or self.get_references:
                results = self._get_full_content(results)

            return results

        finally:
            # Restore original value
            self.max_results = original_max_results

    def search_by_venue(self, venue_name: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Search for papers in a specific venue.

        Args:
            venue_name: Name of the venue (conference or journal)
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers from the venue
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            # Semantic Scholar doesn't have a dedicated venue search API
            # So we search for papers with the venue in the query
            query = f'venue:"{venue_name}"'
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results

    def search_by_year(self, query: str, year: int, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Search for papers from a specific year matching the query.

        Args:
            query: The search query
            year: Publication year
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers from the specified year matching the query
        """
        original_max_results = self.max_results
        original_year_range = self.year_range

        try:
            if max_results:
                self.max_results = max_results

            # Set year range for this search
            self.year_range = (year, year)

            return self.run(query)

        finally:
            # Restore original values
            self.max_results = original_max_results
            self.year_range = original_year_range

    def search_by_field(self, query: str, field_of_study: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Search for papers in a specific field of study.

        Args:
            query: The search query
            field_of_study: Field of study (e.g., "Computer Science", "Medicine")
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers in the specified field matching the query
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            # Add field of study to query
            field_query = f'{query} fieldofstudy:"{field_of_study}"'
            return self.run(field_query)

        finally:
            # Restore original value
            self.max_results = original_max_results

    def get_paper_by_id(self, paper_id: str) -> Dict[str, Any]:
        """
        Get a specific paper by its Semantic Scholar ID.

        Args:
            paper_id: Semantic Scholar paper ID

        Returns:
            Dictionary with paper information
        """
        paper_details = self._get_paper_details(paper_id)

        if not paper_details:
            return {}

        # Format authors
        authors = []
        if "authors" in paper_details and paper_details["authors"]:
            authors = [author.get("name", "") for author in paper_details["authors"]]

        # Create formatted result
        result = {
            "id": paper_details.get("paperId", ""),
            "title": paper_details.get("title", ""),
            "link": paper_details.get("url", ""),
            "abstract": paper_details.get("abstract", ""),
            "authors": authors,
            "venue": paper_details.get("venue", ""),
            "year": paper_details.get("year"),
            "fields_of_study": paper_details.get("fieldsOfStudy", []),
            "external_ids": paper_details.get("externalIds", {}),
            "source": "Semantic Scholar",

            # Include TLDR if available
            "tldr": paper_details.get("tldr", {}).get("text", "") if paper_details.get("tldr") else ""
        }

        # Add citations and references if requested
        if self.get_citations and "citations" in paper_details:
            result["citations"] = paper_details["citations"]

        if self.get_references and "references" in paper_details:
            result["references"] = paper_details["references"]

        # Add embedding if requested
        if self.get_embeddings and "embedding" in paper_details:
            result["embedding"] = paper_details["embedding"]

        return result

    def get_paper_by_doi(self, doi: str) -> Dict[str, Any]:
        """
        Get a paper by its DOI.

        Args:
            doi: Digital Object Identifier

        Returns:
            Dictionary with paper information
        """
        try:
            # The Semantic Scholar API supports DOI lookup
            url = f"{self.paper_details_url}/DOI:{doi}"
            fields = [
                "paperId",
                "externalIds",
                "url",
                "title",
                "abstract",
                "venue",
                "year",
                "authors",
                "fieldsOfStudy"
            ]

            if self.get_tldr:
                fields.append("tldr")

            if self.get_embeddings:
                fields.append("embedding")

            # Add citation and reference fields if requested
            if self.get_citations:
                fields.append(f"citations.limit({self.citation_limit})")

            if self.get_references:
                fields.append(f"references.limit({self.reference_limit})")

            params = {"fields": ",".join(fields)}
            paper_details = self._make_request(url, params)

            if not paper_details:
                return {}

            # Format the paper info the same way as get_paper_by_id
            # Format authors
            authors = []
            if "authors" in paper_details and paper_details["authors"]:
                authors = [author.get("name", "") for author in paper_details["authors"]]

            # Create formatted result
            result = {
                "id": paper_details.get("paperId", ""),
                "title": paper_details.get("title", ""),
                "link": paper_details.get("url", ""),
                "abstract": paper_details.get("abstract", ""),
                "authors": authors,
                "venue": paper_details.get("venue", ""),
                "year": paper_details.get("year"),
                "fields_of_study": paper_details.get("fieldsOfStudy", []),
                "external_ids": paper_details.get("externalIds", {}),
                "source": "Semantic Scholar",

                # Include TLDR if available
                "tldr": paper_details.get("tldr", {}).get("text", "") if paper_details.get("tldr") else ""
            }

            # Add citations and references if requested
            if self.get_citations and "citations" in paper_details:
                result["citations"] = paper_details["citations"]

            if self.get_references and "references" in paper_details:
                result["references"] = paper_details["references"]

            # Add embedding if requested
            if self.get_embeddings and "embedding" in paper_details:
                result["embedding"] = paper_details["embedding"]

            return result

        except Exception as e:
            logger.error(f"Error getting paper by DOI {doi}: {e}")
            return {}

    def get_papers_batch(self, paper_ids: List[str], fields: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """
        Get details for multiple papers in a single batch request.

        Args:
            paper_ids: List of paper IDs (Semantic Scholar IDs, DOIs, arXiv IDs, etc.)
            fields: Fields to include in the response

        Returns:
            List of paper details
        """
        if not paper_ids:
            return []

        if fields is None:
            fields = [
                "paperId",
                "externalIds",
                "url",
                "title",
                "abstract",
                "venue",
                "year",
                "authors",
                "referenceCount",
                "citationCount"
            ]

            if self.get_tldr:
                fields.append("tldr")

        try:
            # Construct request params
            params = {
                "fields": ",".join(fields)
            }

            # Make POST request with paper IDs in the body
            response = self._make_request(
                self.paper_batch_url,
                params=params,
                data={"ids": paper_ids},
                method="POST"
            )

            if isinstance(response, list):
                return response
            else:
                logger.warning("Unexpected response format from batch API")
                return []

        except Exception as e:
            logger.error(f"Error in batch paper lookup: {e}")
            return []

    def get_paper_recommendations(self,
                                  positive_paper_ids: List[str],
                                  negative_paper_ids: Optional[List[str]] = None,
                                  max_results: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Get recommended papers based on positive and negative examples.

        Args:
            positive_paper_ids: List of paper IDs to use as positive examples
            negative_paper_ids: Optional list of paper IDs to use as negative examples
            max_results: Maximum number of recommendations to return

        Returns:
            List of recommended papers
        """
        if not positive_paper_ids:
            return []

        limit = max_results or self.max_results

        try:
            # Construct the request payload
            payload = {
                "positivePaperIds": positive_paper_ids
            }

            if negative_paper_ids:
                payload["negativePaperIds"] = negative_paper_ids

            # Define fields to include in the response
            fields = [
                "paperId",
                "externalIds",
                "url",
                "title",
                "abstract",
                "venue",
                "year",
                "authors"
            ]

            if self.get_tldr:
                fields.append("tldr")

            # Request parameters
            params = {
                "fields": ",".join(fields),
                "limit": limit
            }

            # Make POST request to recommendations endpoint
            response = self._make_request(
                self.recommendations_url,
                params=params,
                data=payload,
                method="POST"
            )

            if "recommendedPapers" not in response:
                return []

            papers = response["recommendedPapers"]

            # Format as standard results
            results = []
            for paper in papers:
                # Format authors
                authors = []
                if "authors" in paper and paper["authors"]:
                    authors = [author.get("name", "") for author in paper["authors"]]

                result = {
                    "id": paper.get("paperId", ""),
                    "title": paper.get("title", ""),
                    "link": paper.get("url", ""),
                    "snippet": paper.get("abstract", "")[:250] + "..." if paper.get("abstract", "") and len(paper.get("abstract", "")) > 250 else paper.get("abstract", ""),
                    "authors": authors,
                    "venue": paper.get("venue", ""),
                    "year": paper.get("year"),
                    "source": "Semantic Scholar",

                    # Include TLDR if available
                    "tldr": paper.get("tldr", {}).get("text", "") if paper.get("tldr") else ""
                }

                results.append(result)

            return results

        except Exception as e:
            logger.error(f"Error getting paper recommendations: {e}")
            return []
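
For orientation, a minimal usage sketch of the new engine (not part of the diff itself, and assuming the package is installed). The run() entry point is inherited from BaseSearchEngine; it is the same method the search_by_venue, search_by_year, and search_by_field helpers above call internally, and the result keys below match the preview dictionaries built in _get_previews:

# Hypothetical usage example; not shipped in the wheel.
from local_deep_research.web_search_engines.engines.search_engine_semantic_scholar import (
    SemanticScholarSearchEngine,
)

# Works without an API key, subject to the public rate limits the
# session retry/backoff logic above is designed to absorb.
engine = SemanticScholarSearchEngine(max_results=5, get_tldr=True)

for paper in engine.run("mRNA vaccine efficacy"):
    print(paper.get("year"), paper.get("title"), paper.get("link"))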